1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 //===----------------------------------------------------------------------===//
11 #ifndef LLVM_FUZZER_CORPUS
12 #define LLVM_FUZZER_CORPUS
14 #include "FuzzerDataFlowTrace.h"
15 #include "FuzzerDefs.h"
17 #include "FuzzerRandom.h"
18 #include "FuzzerSHA1.h"
19 #include "FuzzerTracePC.h"
23 #include <unordered_set>
28 Unit U; // The actual input data.
29 uint8_t Sha1[kSHA1NumBytes]; // Checksum.
30 // Number of features that this input has and no smaller input has.
31 size_t NumFeatures = 0;
32 size_t Tmp = 0; // Used by ValidateFeatureSet.
34 size_t NumExecutedMutations = 0;
35 size_t NumSuccessfullMutations = 0;
36 bool MayDeleteFile = false;
38 bool HasFocusFunction = false;
39 Vector<uint32_t> UniqFeatureSet;
40 Vector<uint8_t> DataFlowTraceForFocusFunction;
44 static const size_t kFeatureSetSize = 1 << 21;
46 InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) {
47 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
48 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
51 for (auto II : Inputs)
54 size_t size() const { return Inputs.size(); }
55 size_t SizeInBytes() const {
57 for (auto II : Inputs)
61 size_t NumActiveUnits() const {
63 for (auto II : Inputs)
64 Res += !II->U.empty();
67 size_t MaxInputSize() const {
69 for (auto II : Inputs)
70 Res = std::max(Res, II->U.size());
74 size_t NumInputsThatTouchFocusFunction() {
75 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
76 return II->HasFocusFunction;
80 size_t NumInputsWithDataFlowTrace() {
81 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
82 return !II->DataFlowTraceForFocusFunction.empty();
86 bool empty() const { return Inputs.empty(); }
87 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
88 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,
89 bool HasFocusFunction,
90 const Vector<uint32_t> &FeatureSet,
91 const DataFlowTrace &DFT, const InputInfo *BaseII) {
94 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);
95 Inputs.push_back(new InputInfo());
96 InputInfo &II = *Inputs.back();
98 II.NumFeatures = NumFeatures;
99 II.MayDeleteFile = MayDeleteFile;
100 II.UniqFeatureSet = FeatureSet;
101 II.HasFocusFunction = HasFocusFunction;
102 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());
103 ComputeSHA1(U.data(), U.size(), II.Sha1);
104 auto Sha1Str = Sha1ToString(II.Sha1);
105 Hashes.insert(Sha1Str);
106 if (HasFocusFunction)
107 if (auto V = DFT.Get(Sha1Str))
108 II.DataFlowTraceForFocusFunction = *V;
109 // This is a gross heuristic.
110 // Ideally, when we add an element to a corpus we need to know its DFT.
111 // But if we don't, we'll use the DFT of its base input.
112 if (II.DataFlowTraceForFocusFunction.empty() && BaseII)
113 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;
114 UpdateCorpusDistribution();
116 // ValidateFeatureSet();
121 void PrintUnit(const Unit &U) {
122 if (!FeatureDebug) return;
123 for (uint8_t C : U) {
124 if (C != 'F' && C != 'U' && C != 'Z')
131 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) {
132 if (!FeatureDebug) return;
134 for (uint32_t Feature: FeatureSet)
135 Printf("%u,", Feature);
141 if (!FeatureDebug) return;
142 Printf("======= CORPUS:\n");
144 for (auto II : Inputs) {
145 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {
147 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());
150 PrintFeatureSet(II->UniqFeatureSet);
157 void Replace(InputInfo *II, const Unit &U) {
158 assert(II->U.size() > U.size());
159 Hashes.erase(Sha1ToString(II->Sha1));
161 ComputeSHA1(U.data(), U.size(), II->Sha1);
162 Hashes.insert(Sha1ToString(II->Sha1));
165 UpdateCorpusDistribution();
168 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
169 bool HasUnit(const std::string &H) { return Hashes.count(H); }
170 InputInfo &ChooseUnitToMutate(Random &Rand) {
171 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];
172 assert(!II.U.empty());
176 // Returns an index of random unit from the corpus to mutate.
177 size_t ChooseUnitIdxToMutate(Random &Rand) {
178 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
179 assert(Idx < Inputs.size());
184 for (size_t i = 0; i < Inputs.size(); i++) {
185 const auto &II = *Inputs[i];
186 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,
187 Sha1ToString(II.Sha1).c_str(), II.U.size(),
188 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction);
192 void PrintFeatureSet() {
193 for (size_t i = 0; i < kFeatureSetSize; i++) {
194 if(size_t Sz = GetFeature(i))
195 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);
198 for (size_t i = 0; i < Inputs.size(); i++)
199 if (size_t N = Inputs[i]->NumFeatures)
200 Printf(" %zd=>%zd ", i, N);
204 void DeleteFile(const InputInfo &II) {
205 if (!OutputCorpus.empty() && II.MayDeleteFile)
206 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));
209 void DeleteInput(size_t Idx) {
210 InputInfo &II = *Inputs[Idx];
214 Printf("EVICTED %zd\n", Idx);
217 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
219 Idx = Idx % kFeatureSetSize;
220 uint32_t OldSize = GetFeature(Idx);
221 if (OldSize == 0 || (Shrink && OldSize > NewSize)) {
223 size_t OldIdx = SmallestElementPerFeature[Idx];
224 InputInfo &II = *Inputs[OldIdx];
225 assert(II.NumFeatures > 0);
227 if (II.NumFeatures == 0)
232 NumUpdatedFeatures++;
234 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
235 SmallestElementPerFeature[Idx] = Inputs.size();
236 InputSizesPerFeature[Idx] = NewSize;
242 size_t NumFeatures() const { return NumAddedFeatures; }
243 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
247 static const bool FeatureDebug = false;
249 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }
251 void ValidateFeatureSet() {
254 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)
256 Inputs[SmallestElementPerFeature[Idx]]->Tmp++;
257 for (auto II: Inputs) {
258 if (II->Tmp != II->NumFeatures)
259 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);
260 assert(II->Tmp == II->NumFeatures);
265 // Updates the probability distribution for the units in the corpus.
266 // Must be called whenever the corpus or unit weights are changed.
268 // Hypothesis: units added to the corpus last are more interesting.
270 // Hypothesis: inputs with infrequent features are more interesting.
271 void UpdateCorpusDistribution() {
272 size_t N = Inputs.size();
274 Intervals.resize(N + 1);
276 std::iota(Intervals.begin(), Intervals.end(), 0);
277 for (size_t i = 0; i < N; i++)
278 Weights[i] = Inputs[i]->NumFeatures
279 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1)
282 for (size_t i = 0; i < N; i++)
283 Printf("%zd ", Inputs[i]->NumFeatures);
285 for (size_t i = 0; i < N; i++)
286 Printf("%f ", Weights[i]);
289 CorpusDistribution = std::piecewise_constant_distribution<double>(
290 Intervals.begin(), Intervals.end(), Weights.begin());
292 std::piecewise_constant_distribution<double> CorpusDistribution;
294 Vector<double> Intervals;
295 Vector<double> Weights;
297 std::unordered_set<std::string> Hashes;
298 Vector<InputInfo*> Inputs;
300 size_t NumAddedFeatures = 0;
301 size_t NumUpdatedFeatures = 0;
302 uint32_t InputSizesPerFeature[kFeatureSetSize];
303 uint32_t SmallestElementPerFeature[kFeatureSetSize];
305 std::string OutputCorpus;
308 } // namespace fuzzer
310 #endif // LLVM_FUZZER_CORPUS