blob: 710249c17b19745630aa5125fde4e0cd708b1d7a [file] [log] [blame]
Alex Deymo48ad5ab2017-09-13 22:17:57 +02001// Copyright 2017 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "bsdiff/suffix_array_index.h"
6
Jordan R Abrahams-Whitehead59d92932022-04-06 21:48:53 +00007#include <algorithm>
Alex Deymo48ad5ab2017-09-13 22:17:57 +02008#include <limits>
9#include <vector>
10
11#include <divsufsort.h>
12#include <divsufsort64.h>
13
14#include "bsdiff/logging.h"
15
Alex Deymo48ad5ab2017-09-13 22:17:57 +020016namespace {
17
18// libdivsufsort C++ overloaded functions used to allow calling the right
19// implementation based on the pointer size.
20int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
21 return divsufsort(text, sa, n);
22}
23int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
24 return divsufsort64(text, sa, n);
25}
26
27saidx_t CallSaSearch(const uint8_t* text,
28 size_t text_size,
29 const uint8_t* pattern,
30 size_t pattern_size,
31 const saidx_t* sa,
32 size_t sa_size,
33 saidx_t* left) {
34 return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
35}
36saidx64_t CallSaSearch(const uint8_t* text,
37 size_t text_size,
38 const uint8_t* pattern,
39 size_t pattern_size,
40 const saidx64_t* sa,
41 size_t sa_size,
42 saidx64_t* left) {
43 return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
44}
45
46} // namespace
47
48namespace bsdiff {
49
50// The SAIDX template type must be either saidx_t or saidx64_t, which will
51// depend on the maximum size of the input text needed.
52template <typename SAIDX>
53class SuffixArrayIndex : public SuffixArrayIndexInterface {
54 public:
55 SuffixArrayIndex() = default;
56
57 // Initialize and construct the suffix array of the |text| string of length
58 // |n|. The memory pointed by |text| must be kept alive. Returns whether the
59 // construction succeeded.
60 bool Init(const uint8_t* text, size_t n);
61
62 // SuffixArrayIndexInterface overrides.
63 void SearchPrefix(const uint8_t* target,
64 size_t length,
65 size_t* out_length,
66 uint64_t* out_pos) const override;
67
68 private:
69 const uint8_t* text_{nullptr}; // Owned by the caller.
70 size_t n_{0};
71
72 std::vector<SAIDX> sa_;
73};
74
75template <typename SAIDX>
76bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
77 if (!sa_.empty()) {
78 // Already initialized.
Tianjie Xu18480eb2017-11-29 16:21:43 -080079 LOG(ERROR) << "SuffixArray already initialized";
Alex Deymo48ad5ab2017-09-13 22:17:57 +020080 return false;
81 }
82 if (static_cast<uint64_t>(n) >
83 static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
Tianjie Xu18480eb2017-11-29 16:21:43 -080084 LOG(ERROR) << "Input too big (" << n << ") for this implementation";
Alex Deymo48ad5ab2017-09-13 22:17:57 +020085 return false;
86 }
87 text_ = text;
88 n_ = n;
89 sa_.resize(n + 1);
90
91 if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
Tianjie Xu18480eb2017-11-29 16:21:43 -080092 LOG(ERROR) << "divsufsrot() failed";
Alex Deymo48ad5ab2017-09-13 22:17:57 +020093 return false;
94 }
95
96 return true;
97}
98
99template <typename SAIDX>
100void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
101 size_t length,
102 size_t* out_length,
103 uint64_t* out_pos) const {
104 SAIDX suf_left;
105 SAIDX count =
106 CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
107 if (count > 0) {
108 // This is the simple case where we found the whole |target| string was
109 // found.
110 *out_pos = sa_[suf_left];
111 *out_length = length;
112 return;
113 }
114 // In this case, |suf_left| points to the first suffix array position such
115 // that the suffix at that position is lexicographically larger than |target|.
116 // We only need to check whether the previous entry or the current entry is a
117 // longer match.
118 size_t prev_suffix_len = 0;
119 if (suf_left > 0) {
120 const size_t prev_max_len =
121 std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
122 const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
123 prev_suffix_len =
124 std::mismatch(target, target + prev_max_len, prev_suffix).first -
125 target;
126 }
127
128 size_t next_suffix_len = 0;
129 if (static_cast<size_t>(suf_left) < n_) {
130 const uint8_t* next_suffix = text_ + sa_[suf_left];
131 const size_t next_max_len =
132 std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
133 next_suffix_len =
134 std::mismatch(target, target + next_max_len, next_suffix).first -
135 target;
136 }
137
138 *out_length = std::max(next_suffix_len, prev_suffix_len);
139 if (!*out_length) {
140 *out_pos = 0;
141 } else if (next_suffix_len >= prev_suffix_len) {
142 *out_pos = sa_[suf_left];
143 } else {
144 *out_pos = sa_[suf_left - 1];
145 }
146}
147
148std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
149 const uint8_t* text,
150 size_t n) {
151 // The maximum supported size when using the suffix array based on the 32-bit
152 // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
153 // than the maximum representable number so references like "n + 1" are don't
154 // overflow.
155 const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
156 std::unique_ptr<SuffixArrayIndexInterface> ret;
157
158 if (n > kMaxSaidxSize) {
159 SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
160 ret.reset(sa_ptr);
161 if (!sa_ptr->Init(text, n))
162 return nullptr;
163 } else {
164 SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
165 ret.reset(sa_ptr);
166 if (!sa_ptr->Init(text, n))
167 return nullptr;
168 }
169 return ret;
170}
171
172
173} // namespace bsdiff