blob: 201f90a4f5d18a8f0804e4b657a0a9c63118a161 [file] [log] [blame]
Samuel Huang06f1ae92018-03-13 18:19:34 +00001// Copyright 2017 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
6#define COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
7
8#include <stddef.h>
9#include <stdint.h>
10
11#include <memory>
12#include <string>
13
Samuel Huang06f1ae92018-03-13 18:19:34 +000014#include "components/zucchini/buffer_view.h"
15
16namespace zucchini {
17
18// A class to detect outliers in a list of doubles using Chauvenet's criterion:
19// Compute mean and standard deviation of observations, then determine whether
20// a query value lies beyond a fixed number of standard deviations (sigmas) from
21// the mean. The purpose of this test is to reduce the chance of false-positive
22// ensemble matches.
23class OutlierDetector {
24 public:
25 OutlierDetector();
Samuel Huangf137bf42021-08-13 15:42:26 +000026 OutlierDetector(const OutlierDetector&) = delete;
27 const OutlierDetector& operator=(const OutlierDetector&) = delete;
Samuel Huang06f1ae92018-03-13 18:19:34 +000028 ~OutlierDetector();
29
30 // Incorporates |sample| into mean and standard deviation.
31 void Add(double sample);
32
33 // Prepares basic statistics for DecideOutlier() calls. Should be called after
34 // all samples have been added.
35 void Prepare();
36
37 // Renders current statistics as strings for logging.
38 std::string RenderStats();
39
40 // Heuristically decides whether |sample| is an outlier. Returns 1 if |sample|
41 // is "too high", 0 if |sample| is "normal", and -1 if |sample| is "too low".
42 // Must be called after Prepare().
43 int DecideOutlier(double sample);
44
45 private:
46 size_t n_ = 0;
47 double sum_ = 0;
48 double sum_of_squares_ = 0;
49 double mean_ = 0;
50 double standard_deviation_ = 0;
Samuel Huang06f1ae92018-03-13 18:19:34 +000051};
52
53// A class to compute similarity score between binary data. The heuristic here
54// preprocesses input data to a size-65536 histogram, counting the frequency of
55// consecutive 2-byte sequences. Therefore data with lengths < 2 are considered
56// invalid -- but this is okay for Zucchini's use case.
57class BinaryDataHistogram {
58 public:
59 BinaryDataHistogram();
Samuel Huangf137bf42021-08-13 15:42:26 +000060 BinaryDataHistogram(const BinaryDataHistogram&) = delete;
61 const BinaryDataHistogram& operator=(const BinaryDataHistogram&) = delete;
Samuel Huang06f1ae92018-03-13 18:19:34 +000062 ~BinaryDataHistogram();
63
64 // Attempts to compute the histogram, returns true iff successful.
65 bool Compute(ConstBufferView region);
66
67 bool IsValid() const { return static_cast<bool>(histogram_); }
68
69 // Returns distance to another histogram (heuristics). If two binaries are
70 // identical then their histogram distance is 0. However, the converse is not
71 // true in general. For example, "aba" and "bab" are different, but their
72 // histogram distance is 0 (both histograms are {"ab": 1, "ba": 1}).
73 double Distance(const BinaryDataHistogram& other) const;
74
75 private:
76 enum { kNumBins = 1 << (sizeof(uint16_t) * 8) };
77 static_assert(kNumBins == 65536, "Incorrect constant computation.");
78
79 // Size, in bytes, of the data over which the histogram was computed.
80 size_t size_ = 0;
81
82 // 2^16 buckets holding counts of all 2-byte sequences in the data. The counts
83 // are stored as signed values to simplify computing the distance between two
84 // histograms.
85 std::unique_ptr<int32_t[]> histogram_;
Samuel Huang06f1ae92018-03-13 18:19:34 +000086};
87
88} // namespace zucchini
89
90#endif // COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_