1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 //===----------------------------------------------------------------------===//
12 #ifndef LLVM_FUZZER_CORPUS
13 #define LLVM_FUZZER_CORPUS
15 #include "FuzzerDataFlowTrace.h"
16 #include "FuzzerDefs.h"
18 #include "FuzzerRandom.h"
19 #include "FuzzerSHA1.h"
20 #include "FuzzerTracePC.h"
24 #include <unordered_set>
29 Unit U; // The actual input data.
30 uint8_t Sha1[kSHA1NumBytes]; // Checksum.
31 // Number of features that this input has and no smaller input has.
32 size_t NumFeatures = 0;
33 size_t Tmp = 0; // Used by ValidateFeatureSet.
35 size_t NumExecutedMutations = 0;
36 size_t NumSuccessfullMutations = 0;
37 bool MayDeleteFile = false;
39 bool HasFocusFunction = false;
40 Vector<uint32_t> UniqFeatureSet;
41 Vector<uint8_t> DataFlowTraceForFocusFunction;
45 static const size_t kFeatureSetSize = 1 << 21;
47 InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) {
48 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
49 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
52 for (auto II : Inputs)
55 size_t size() const { return Inputs.size(); }
56 size_t SizeInBytes() const {
58 for (auto II : Inputs)
62 size_t NumActiveUnits() const {
64 for (auto II : Inputs)
65 Res += !II->U.empty();
68 size_t MaxInputSize() const {
70 for (auto II : Inputs)
71 Res = std::max(Res, II->U.size());
75 size_t NumInputsThatTouchFocusFunction() {
76 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
77 return II->HasFocusFunction;
81 size_t NumInputsWithDataFlowTrace() {
82 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
83 return !II->DataFlowTraceForFocusFunction.empty();
87 bool empty() const { return Inputs.empty(); }
88 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
89 void AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,
90 bool HasFocusFunction, const Vector<uint32_t> &FeatureSet,
91 const DataFlowTrace &DFT, const InputInfo *BaseII) {
94 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);
95 Inputs.push_back(new InputInfo());
96 InputInfo &II = *Inputs.back();
98 II.NumFeatures = NumFeatures;
99 II.MayDeleteFile = MayDeleteFile;
100 II.UniqFeatureSet = FeatureSet;
101 II.HasFocusFunction = HasFocusFunction;
102 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());
103 ComputeSHA1(U.data(), U.size(), II.Sha1);
104 auto Sha1Str = Sha1ToString(II.Sha1);
105 Hashes.insert(Sha1Str);
106 if (HasFocusFunction)
107 if (auto V = DFT.Get(Sha1Str))
108 II.DataFlowTraceForFocusFunction = *V;
109 // This is a gross heuristic.
110 // Ideally, when we add an element to a corpus we need to know its DFT.
111 // But if we don't, we'll use the DFT of its base input.
112 if (II.DataFlowTraceForFocusFunction.empty() && BaseII)
113 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;
114 UpdateCorpusDistribution();
116 // ValidateFeatureSet();
120 void PrintUnit(const Unit &U) {
121 if (!FeatureDebug) return;
122 for (uint8_t C : U) {
123 if (C != 'F' && C != 'U' && C != 'Z')
130 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) {
131 if (!FeatureDebug) return;
133 for (uint32_t Feature: FeatureSet)
134 Printf("%u,", Feature);
140 if (!FeatureDebug) return;
141 Printf("======= CORPUS:\n");
143 for (auto II : Inputs) {
144 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {
146 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());
149 PrintFeatureSet(II->UniqFeatureSet);
156 void Replace(InputInfo *II, const Unit &U) {
157 assert(II->U.size() > U.size());
158 Hashes.erase(Sha1ToString(II->Sha1));
160 ComputeSHA1(U.data(), U.size(), II->Sha1);
161 Hashes.insert(Sha1ToString(II->Sha1));
164 UpdateCorpusDistribution();
167 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
168 bool HasUnit(const std::string &H) { return Hashes.count(H); }
169 InputInfo &ChooseUnitToMutate(Random &Rand) {
170 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];
171 assert(!II.U.empty());
175 // Returns an index of random unit from the corpus to mutate.
176 size_t ChooseUnitIdxToMutate(Random &Rand) {
177 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
178 assert(Idx < Inputs.size());
183 for (size_t i = 0; i < Inputs.size(); i++) {
184 const auto &II = *Inputs[i];
185 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,
186 Sha1ToString(II.Sha1).c_str(), II.U.size(),
187 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction);
191 void PrintFeatureSet() {
192 for (size_t i = 0; i < kFeatureSetSize; i++) {
193 if(size_t Sz = GetFeature(i))
194 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);
197 for (size_t i = 0; i < Inputs.size(); i++)
198 if (size_t N = Inputs[i]->NumFeatures)
199 Printf(" %zd=>%zd ", i, N);
203 void DeleteFile(const InputInfo &II) {
204 if (!OutputCorpus.empty() && II.MayDeleteFile)
205 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));
208 void DeleteInput(size_t Idx) {
209 InputInfo &II = *Inputs[Idx];
213 Printf("EVICTED %zd\n", Idx);
216 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
218 Idx = Idx % kFeatureSetSize;
219 uint32_t OldSize = GetFeature(Idx);
220 if (OldSize == 0 || (Shrink && OldSize > NewSize)) {
222 size_t OldIdx = SmallestElementPerFeature[Idx];
223 InputInfo &II = *Inputs[OldIdx];
224 assert(II.NumFeatures > 0);
226 if (II.NumFeatures == 0)
231 NumUpdatedFeatures++;
233 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
234 SmallestElementPerFeature[Idx] = Inputs.size();
235 InputSizesPerFeature[Idx] = NewSize;
241 size_t NumFeatures() const { return NumAddedFeatures; }
242 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
246 static const bool FeatureDebug = false;
248 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }
250 void ValidateFeatureSet() {
253 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)
255 Inputs[SmallestElementPerFeature[Idx]]->Tmp++;
256 for (auto II: Inputs) {
257 if (II->Tmp != II->NumFeatures)
258 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);
259 assert(II->Tmp == II->NumFeatures);
264 // Updates the probability distribution for the units in the corpus.
265 // Must be called whenever the corpus or unit weights are changed.
267 // Hypothesis: units added to the corpus last are more interesting.
269 // Hypothesis: inputs with infrequent features are more interesting.
270 void UpdateCorpusDistribution() {
271 size_t N = Inputs.size();
273 Intervals.resize(N + 1);
275 std::iota(Intervals.begin(), Intervals.end(), 0);
276 for (size_t i = 0; i < N; i++)
277 Weights[i] = Inputs[i]->NumFeatures
278 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1)
281 for (size_t i = 0; i < N; i++)
282 Printf("%zd ", Inputs[i]->NumFeatures);
284 for (size_t i = 0; i < N; i++)
285 Printf("%f ", Weights[i]);
288 CorpusDistribution = std::piecewise_constant_distribution<double>(
289 Intervals.begin(), Intervals.end(), Weights.begin());
291 std::piecewise_constant_distribution<double> CorpusDistribution;
293 Vector<double> Intervals;
294 Vector<double> Weights;
296 std::unordered_set<std::string> Hashes;
297 Vector<InputInfo*> Inputs;
299 size_t NumAddedFeatures = 0;
300 size_t NumUpdatedFeatures = 0;
301 uint32_t InputSizesPerFeature[kFeatureSetSize];
302 uint32_t SmallestElementPerFeature[kFeatureSetSize];
304 std::string OutputCorpus;
307 } // namespace fuzzer
309 #endif // LLVM_FUZZER_CORPUS