1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 //===----------------------------------------------------------------------===//
11 #ifndef LLVM_FUZZER_CORPUS
12 #define LLVM_FUZZER_CORPUS
14 #include "FuzzerDataFlowTrace.h"
15 #include "FuzzerDefs.h"
17 #include "FuzzerRandom.h"
18 #include "FuzzerSHA1.h"
19 #include "FuzzerTracePC.h"
23 #include <unordered_set>
28 Unit U; // The actual input data.
29 uint8_t Sha1[kSHA1NumBytes]; // Checksum.
30 // Number of features that this input has and no smaller input has.
31 size_t NumFeatures = 0;
32 size_t Tmp = 0; // Used by ValidateFeatureSet.
34 size_t NumExecutedMutations = 0;
35 size_t NumSuccessfullMutations = 0;
36 bool MayDeleteFile = false;
38 bool HasFocusFunction = false;
39 Vector<uint32_t> UniqFeatureSet;
40 Vector<uint8_t> DataFlowTraceForFocusFunction;
42 bool NeedsEnergyUpdate = false;
44 size_t SumIncidence = 0;
45 Vector<std::pair<uint32_t, uint16_t>> FeatureFreqs;
47 // Delete feature Idx and its frequency from FeatureFreqs.
48 bool DeleteFeatureFreq(uint32_t Idx) {
49 if (FeatureFreqs.empty())
52 // Binary search over local feature frequencies sorted by index.
53 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
54 std::pair<uint32_t, uint16_t>(Idx, 0));
56 if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
57 FeatureFreqs.erase(Lower);
63 // Assign more energy to a high-entropy seed, i.e., that reveals more
64 // information about the globally rare features in the neighborhood
65 // of the seed. Since we do not know the entropy of a seed that has
66 // never been executed we assign fresh seeds maximum entropy and
67 // let II->Energy approach the true entropy from above.
68 void UpdateEnergy(size_t GlobalNumberOfFeatures) {
72 // Apply add-one smoothing to locally discovered features.
73 for (auto F : FeatureFreqs) {
74 size_t LocalIncidence = F.second + 1;
75 Energy -= LocalIncidence * logl(LocalIncidence);
76 SumIncidence += LocalIncidence;
79 // Apply add-one smoothing to locally undiscovered features.
80 // PreciseEnergy -= 0; // since logl(1.0) == 0)
81 SumIncidence += (GlobalNumberOfFeatures - FeatureFreqs.size());
83 // Add a single locally abundant feature apply add-one smoothing.
84 size_t AbdIncidence = NumExecutedMutations + 1;
85 Energy -= AbdIncidence * logl(AbdIncidence);
86 SumIncidence += AbdIncidence;
89 if (SumIncidence != 0)
90 Energy = (Energy / SumIncidence) + logl(SumIncidence);
93 // Increment the frequency of the feature Idx.
94 void UpdateFeatureFrequency(uint32_t Idx) {
95 NeedsEnergyUpdate = true;
97 // The local feature frequencies is an ordered vector of pairs.
98 // If there are no local feature frequencies, push_back preserves order.
99 // Set the feature frequency for feature Idx32 to 1.
100 if (FeatureFreqs.empty()) {
101 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1));
105 // Binary search over local feature frequencies sorted by index.
106 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
107 std::pair<uint32_t, uint16_t>(Idx, 0));
109 // If feature Idx32 already exists, increment its frequency.
110 // Otherwise, insert a new pair right after the next lower index.
111 if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
114 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1));
119 struct EntropicOptions {
121 size_t NumberOfRarestFeatures;
122 size_t FeatureFrequencyThreshold;
126 static const uint32_t kFeatureSetSize = 1 << 21;
127 static const uint8_t kMaxMutationFactor = 20;
128 static const size_t kSparseEnergyUpdates = 100;
130 size_t NumExecutedMutations = 0;
132 EntropicOptions Entropic;
135 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic)
136 : Entropic(Entropic), OutputCorpus(OutputCorpus) {
137 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
138 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
141 for (auto II : Inputs)
144 size_t size() const { return Inputs.size(); }
145 size_t SizeInBytes() const {
147 for (auto II : Inputs)
151 size_t NumActiveUnits() const {
153 for (auto II : Inputs)
154 Res += !II->U.empty();
157 size_t MaxInputSize() const {
159 for (auto II : Inputs)
160 Res = std::max(Res, II->U.size());
163 void IncrementNumExecutedMutations() { NumExecutedMutations++; }
165 size_t NumInputsThatTouchFocusFunction() {
166 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
167 return II->HasFocusFunction;
171 size_t NumInputsWithDataFlowTrace() {
172 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
173 return !II->DataFlowTraceForFocusFunction.empty();
177 bool empty() const { return Inputs.empty(); }
178 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
179 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,
180 bool HasFocusFunction,
181 const Vector<uint32_t> &FeatureSet,
182 const DataFlowTrace &DFT, const InputInfo *BaseII) {
185 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);
186 Inputs.push_back(new InputInfo());
187 InputInfo &II = *Inputs.back();
189 II.NumFeatures = NumFeatures;
190 II.MayDeleteFile = MayDeleteFile;
191 II.UniqFeatureSet = FeatureSet;
192 II.HasFocusFunction = HasFocusFunction;
193 // Assign maximal energy to the new seed.
194 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size());
195 II.SumIncidence = RareFeatures.size();
196 II.NeedsEnergyUpdate = false;
197 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());
198 ComputeSHA1(U.data(), U.size(), II.Sha1);
199 auto Sha1Str = Sha1ToString(II.Sha1);
200 Hashes.insert(Sha1Str);
201 if (HasFocusFunction)
202 if (auto V = DFT.Get(Sha1Str))
203 II.DataFlowTraceForFocusFunction = *V;
204 // This is a gross heuristic.
205 // Ideally, when we add an element to a corpus we need to know its DFT.
206 // But if we don't, we'll use the DFT of its base input.
207 if (II.DataFlowTraceForFocusFunction.empty() && BaseII)
208 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;
209 DistributionNeedsUpdate = true;
211 // ValidateFeatureSet();
216 void PrintUnit(const Unit &U) {
217 if (!FeatureDebug) return;
218 for (uint8_t C : U) {
219 if (C != 'F' && C != 'U' && C != 'Z')
226 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) {
227 if (!FeatureDebug) return;
229 for (uint32_t Feature: FeatureSet)
230 Printf("%u,", Feature);
236 if (!FeatureDebug) return;
237 Printf("======= CORPUS:\n");
239 for (auto II : Inputs) {
240 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {
242 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());
245 PrintFeatureSet(II->UniqFeatureSet);
252 void Replace(InputInfo *II, const Unit &U) {
253 assert(II->U.size() > U.size());
254 Hashes.erase(Sha1ToString(II->Sha1));
256 ComputeSHA1(U.data(), U.size(), II->Sha1);
257 Hashes.insert(Sha1ToString(II->Sha1));
260 DistributionNeedsUpdate = true;
263 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
264 bool HasUnit(const std::string &H) { return Hashes.count(H); }
265 InputInfo &ChooseUnitToMutate(Random &Rand) {
266 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];
267 assert(!II.U.empty());
271 // Returns an index of random unit from the corpus to mutate.
272 size_t ChooseUnitIdxToMutate(Random &Rand) {
273 UpdateCorpusDistribution(Rand);
274 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
275 assert(Idx < Inputs.size());
280 for (size_t i = 0; i < Inputs.size(); i++) {
281 const auto &II = *Inputs[i];
282 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,
283 Sha1ToString(II.Sha1).c_str(), II.U.size(),
284 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction);
288 void PrintFeatureSet() {
289 for (size_t i = 0; i < kFeatureSetSize; i++) {
290 if(size_t Sz = GetFeature(i))
291 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);
294 for (size_t i = 0; i < Inputs.size(); i++)
295 if (size_t N = Inputs[i]->NumFeatures)
296 Printf(" %zd=>%zd ", i, N);
300 void DeleteFile(const InputInfo &II) {
301 if (!OutputCorpus.empty() && II.MayDeleteFile)
302 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));
305 void DeleteInput(size_t Idx) {
306 InputInfo &II = *Inputs[Idx];
310 II.NeedsEnergyUpdate = false;
311 DistributionNeedsUpdate = true;
313 Printf("EVICTED %zd\n", Idx);
316 void AddRareFeature(uint32_t Idx) {
317 // Maintain *at least* TopXRarestFeatures many rare features
318 // and all features with a frequency below ConsideredRare.
319 // Remove all other features.
320 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures &&
321 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) {
323 // Find most and second most abbundant feature.
324 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0],
327 for (size_t i = 0; i < RareFeatures.size(); i++) {
328 uint32_t Idx2 = RareFeatures[i];
329 if (GlobalFeatureFreqs[Idx2] >=
330 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) {
331 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0];
332 MostAbundantRareFeatureIndices[0] = Idx2;
337 // Remove most abundant rare feature.
338 RareFeatures[Delete] = RareFeatures.back();
339 RareFeatures.pop_back();
341 for (auto II : Inputs) {
342 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0]))
343 II->NeedsEnergyUpdate = true;
346 // Set 2nd most abundant as the new most abundant feature count.
347 FreqOfMostAbundantRareFeature =
348 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]];
351 // Add rare feature, handle collisions, and update energy.
352 RareFeatures.push_back(Idx);
353 GlobalFeatureFreqs[Idx] = 0;
354 for (auto II : Inputs) {
355 II->DeleteFeatureFreq(Idx);
357 // Apply add-one smoothing to this locally undiscovered feature.
358 // Zero energy seeds will never be fuzzed and remain zero energy.
359 if (II->Energy > 0.0) {
360 II->SumIncidence += 1;
361 II->Energy += logl(II->SumIncidence) / II->SumIncidence;
365 DistributionNeedsUpdate = true;
368 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
370 Idx = Idx % kFeatureSetSize;
371 uint32_t OldSize = GetFeature(Idx);
372 if (OldSize == 0 || (Shrink && OldSize > NewSize)) {
374 size_t OldIdx = SmallestElementPerFeature[Idx];
375 InputInfo &II = *Inputs[OldIdx];
376 assert(II.NumFeatures > 0);
378 if (II.NumFeatures == 0)
382 if (Entropic.Enabled)
383 AddRareFeature((uint32_t)Idx);
385 NumUpdatedFeatures++;
387 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
388 SmallestElementPerFeature[Idx] = Inputs.size();
389 InputSizesPerFeature[Idx] = NewSize;
395 // Increment frequency of feature Idx globally and locally.
396 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) {
397 uint32_t Idx32 = Idx % kFeatureSetSize;
399 // Saturated increment.
400 if (GlobalFeatureFreqs[Idx32] == 0xFFFF)
402 uint16_t Freq = GlobalFeatureFreqs[Idx32]++;
405 if (Freq > FreqOfMostAbundantRareFeature ||
406 std::find(RareFeatures.begin(), RareFeatures.end(), Idx32) ==
410 // Update global frequencies.
411 if (Freq == FreqOfMostAbundantRareFeature)
412 FreqOfMostAbundantRareFeature++;
414 // Update local frequencies.
416 II->UpdateFeatureFrequency(Idx32);
419 size_t NumFeatures() const { return NumAddedFeatures; }
420 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
424 static const bool FeatureDebug = false;
426 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }
428 void ValidateFeatureSet() {
431 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)
433 Inputs[SmallestElementPerFeature[Idx]]->Tmp++;
434 for (auto II: Inputs) {
435 if (II->Tmp != II->NumFeatures)
436 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);
437 assert(II->Tmp == II->NumFeatures);
442 // Updates the probability distribution for the units in the corpus.
443 // Must be called whenever the corpus or unit weights are changed.
445 // Hypothesis: inputs that maximize information about globally rare features
447 void UpdateCorpusDistribution(Random &Rand) {
448 // Skip update if no seeds or rare features were added/deleted.
449 // Sparse updates for local change of feature frequencies,
450 // i.e., randomly do not skip.
451 if (!DistributionNeedsUpdate &&
452 (!Entropic.Enabled || Rand(kSparseEnergyUpdates)))
455 DistributionNeedsUpdate = false;
457 size_t N = Inputs.size();
459 Intervals.resize(N + 1);
461 std::iota(Intervals.begin(), Intervals.end(), 0);
463 bool VanillaSchedule = true;
464 if (Entropic.Enabled) {
465 for (auto II : Inputs) {
466 if (II->NeedsEnergyUpdate && II->Energy != 0.0) {
467 II->NeedsEnergyUpdate = false;
468 II->UpdateEnergy(RareFeatures.size());
472 for (size_t i = 0; i < N; i++) {
474 if (Inputs[i]->NumFeatures == 0) {
475 // If the seed doesn't represent any features, assign zero energy.
477 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor >
478 NumExecutedMutations / Inputs.size()) {
479 // If the seed was fuzzed a lot more than average, assign zero energy.
482 // Otherwise, simply assign the computed energy.
483 Weights[i] = Inputs[i]->Energy;
486 // If energy for all seeds is zero, fall back to vanilla schedule.
487 if (Weights[i] > 0.0)
488 VanillaSchedule = false;
492 if (VanillaSchedule) {
493 for (size_t i = 0; i < N; i++)
494 Weights[i] = Inputs[i]->NumFeatures
495 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1)
500 for (size_t i = 0; i < N; i++)
501 Printf("%zd ", Inputs[i]->NumFeatures);
503 for (size_t i = 0; i < N; i++)
504 Printf("%f ", Weights[i]);
507 CorpusDistribution = std::piecewise_constant_distribution<double>(
508 Intervals.begin(), Intervals.end(), Weights.begin());
510 std::piecewise_constant_distribution<double> CorpusDistribution;
512 Vector<double> Intervals;
513 Vector<double> Weights;
515 std::unordered_set<std::string> Hashes;
516 Vector<InputInfo*> Inputs;
518 size_t NumAddedFeatures = 0;
519 size_t NumUpdatedFeatures = 0;
520 uint32_t InputSizesPerFeature[kFeatureSetSize];
521 uint32_t SmallestElementPerFeature[kFeatureSetSize];
523 bool DistributionNeedsUpdate = true;
524 uint16_t FreqOfMostAbundantRareFeature = 0;
525 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {};
526 Vector<uint32_t> RareFeatures;
528 std::string OutputCorpus;
531 } // namespace fuzzer
533 #endif // LLVM_FUZZER_CORPUS