devtools_file_system_indexer.cc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. // Copyright 2013 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #include "brightray/browser/devtools_file_system_indexer.h"
  5. #include <stddef.h>
  6. #include <algorithm>
  7. #include <iterator>
  8. #include <set>
  9. #include "base/bind.h"
  10. #include "base/files/file_enumerator.h"
  11. #include "base/files/file_util.h"
  12. #include "base/files/file_util_proxy.h"
  13. #include "base/lazy_instance.h"
  14. #include "base/logging.h"
  15. #include "base/stl_util.h"
  16. #include "base/strings/string_util.h"
  17. #include "base/strings/utf_string_conversions.h"
  18. #include "content/public/browser/browser_thread.h"
  19. using base::Bind;
  20. using base::Callback;
  21. using base::FileEnumerator;
  22. using base::FilePath;
  23. using base::Time;
  24. using base::TimeDelta;
  25. using base::TimeTicks;
  26. using content::BrowserThread;
  27. using std::map;
  28. using std::set;
  29. using std::string;
  30. using std::vector;
  31. namespace brightray {
  32. namespace {
  33. typedef int32_t Trigram;
  34. typedef char TrigramChar;
  35. typedef uint16_t FileId;
  36. const int kMinTimeoutBetweenWorkedNotification = 200;
  37. // Trigram characters include all ASCII printable characters (32-126) except for
  38. // the capital letters, because the index is case insensitive.
  39. const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
  40. const size_t kTrigramCount =
  41. kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
  42. const int kMaxReadLength = 10 * 1024;
  43. const TrigramChar kUndefinedTrigramChar = -1;
  44. const TrigramChar kBinaryTrigramChar = -2;
  45. const Trigram kUndefinedTrigram = -1;
  46. template <typename Char>
  47. bool IsAsciiUpper(Char c) {
  48. return c >= 'A' && c <= 'Z';
  49. }
  50. class Index {
  51. public:
  52. Index();
  53. Time LastModifiedTimeForFile(const FilePath& file_path);
  54. void SetTrigramsForFile(const FilePath& file_path,
  55. const vector<Trigram>& index,
  56. const Time& time);
  57. vector<FilePath> Search(string query);
  58. void PrintStats();
  59. void NormalizeVectors();
  60. private:
  61. ~Index();
  62. FileId GetFileId(const FilePath& file_path);
  63. typedef map<FilePath, FileId> FileIdsMap;
  64. FileIdsMap file_ids_;
  65. FileId last_file_id_;
  66. // The index in this vector is the trigram id.
  67. vector<vector<FileId>> index_;
  68. typedef map<FilePath, Time> IndexedFilesMap;
  69. IndexedFilesMap index_times_;
  70. vector<bool> is_normalized_;
  71. DISALLOW_COPY_AND_ASSIGN(Index);
  72. };
  73. base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
  74. TrigramChar TrigramCharForChar(char c) {
  75. static TrigramChar* trigram_chars = nullptr;
  76. if (!trigram_chars) {
  77. trigram_chars = new TrigramChar[256];
  78. for (size_t i = 0; i < 256; ++i) {
  79. if (i > 127) {
  80. trigram_chars[i] = kUndefinedTrigramChar;
  81. continue;
  82. }
  83. char ch = static_cast<char>(i);
  84. if (ch == '\t')
  85. ch = ' ';
  86. if (IsAsciiUpper(ch))
  87. ch = ch - 'A' + 'a';
  88. bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
  89. if (is_binary_char) {
  90. trigram_chars[i] = kBinaryTrigramChar;
  91. continue;
  92. }
  93. if (ch < ' ') {
  94. trigram_chars[i] = kUndefinedTrigramChar;
  95. continue;
  96. }
  97. if (ch >= 'Z')
  98. ch = ch - 'Z' - 1 + 'A';
  99. ch -= ' ';
  100. char signed_trigram_count = static_cast<char>(kTrigramCharacterCount);
  101. CHECK(ch >= 0 && ch < signed_trigram_count);
  102. trigram_chars[i] = ch;
  103. }
  104. }
  105. unsigned char uc = static_cast<unsigned char>(c);
  106. return trigram_chars[uc];
  107. }
  108. Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
  109. static int kTrigramCharacterCountSquared =
  110. kTrigramCharacterCount * kTrigramCharacterCount;
  111. if (trigram_chars[index] == kUndefinedTrigramChar ||
  112. trigram_chars[index + 1] == kUndefinedTrigramChar ||
  113. trigram_chars[index + 2] == kUndefinedTrigramChar)
  114. return kUndefinedTrigram;
  115. Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
  116. kTrigramCharacterCount * trigram_chars[index + 1] +
  117. trigram_chars[index + 2];
  118. return trigram;
  119. }
  120. Index::Index() : last_file_id_(0) {
  121. index_.resize(kTrigramCount);
  122. is_normalized_.resize(kTrigramCount);
  123. std::fill(is_normalized_.begin(), is_normalized_.end(), true);
  124. }
  125. Index::~Index() {}
  126. Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
  127. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  128. Time last_modified_time;
  129. if (index_times_.find(file_path) != index_times_.end())
  130. last_modified_time = index_times_[file_path];
  131. return last_modified_time;
  132. }
  133. void Index::SetTrigramsForFile(const FilePath& file_path,
  134. const vector<Trigram>& index,
  135. const Time& time) {
  136. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  137. FileId file_id = GetFileId(file_path);
  138. auto it = index.begin();
  139. for (; it != index.end(); ++it) {
  140. Trigram trigram = *it;
  141. index_[trigram].push_back(file_id);
  142. is_normalized_[trigram] = false;
  143. }
  144. index_times_[file_path] = time;
  145. }
  146. vector<FilePath> Index::Search(string query) {
  147. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  148. const char* data = query.c_str();
  149. vector<TrigramChar> trigram_chars;
  150. trigram_chars.reserve(query.size());
  151. for (size_t i = 0; i < query.size(); ++i) {
  152. TrigramChar trigram_char = TrigramCharForChar(data[i]);
  153. if (trigram_char == kBinaryTrigramChar)
  154. trigram_char = kUndefinedTrigramChar;
  155. trigram_chars.push_back(trigram_char);
  156. }
  157. vector<Trigram> trigrams;
  158. for (size_t i = 0; i + 2 < query.size(); ++i) {
  159. Trigram trigram = TrigramAtIndex(trigram_chars, i);
  160. if (trigram != kUndefinedTrigram)
  161. trigrams.push_back(trigram);
  162. }
  163. set<FileId> file_ids;
  164. bool first = true;
  165. vector<Trigram>::const_iterator it = trigrams.begin();
  166. for (; it != trigrams.end(); ++it) {
  167. Trigram trigram = *it;
  168. if (first) {
  169. std::copy(index_[trigram].begin(), index_[trigram].end(),
  170. std::inserter(file_ids, file_ids.begin()));
  171. first = false;
  172. continue;
  173. }
  174. set<FileId> intersection =
  175. base::STLSetIntersection<set<FileId>>(file_ids, index_[trigram]);
  176. file_ids.swap(intersection);
  177. }
  178. vector<FilePath> result;
  179. FileIdsMap::const_iterator ids_it = file_ids_.begin();
  180. for (; ids_it != file_ids_.end(); ++ids_it) {
  181. if (trigrams.empty() || file_ids.find(ids_it->second) != file_ids.end()) {
  182. result.push_back(ids_it->first);
  183. }
  184. }
  185. return result;
  186. }
  187. FileId Index::GetFileId(const FilePath& file_path) {
  188. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  189. string file_path_str = file_path.AsUTF8Unsafe();
  190. if (file_ids_.find(file_path) != file_ids_.end())
  191. return file_ids_[file_path];
  192. file_ids_[file_path] = ++last_file_id_;
  193. return last_file_id_;
  194. }
  195. void Index::NormalizeVectors() {
  196. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  197. for (size_t i = 0; i < kTrigramCount; ++i) {
  198. if (!is_normalized_[i]) {
  199. std::sort(index_[i].begin(), index_[i].end());
  200. if (index_[i].capacity() > index_[i].size())
  201. vector<FileId>(index_[i]).swap(index_[i]);
  202. is_normalized_[i] = true;
  203. }
  204. }
  205. }
  206. void Index::PrintStats() {
  207. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  208. LOG(ERROR) << "Index stats:";
  209. size_t size = 0;
  210. size_t maxSize = 0;
  211. size_t capacity = 0;
  212. for (size_t i = 0; i < kTrigramCount; ++i) {
  213. if (index_[i].size() > maxSize)
  214. maxSize = index_[i].size();
  215. size += index_[i].size();
  216. capacity += index_[i].capacity();
  217. }
  218. LOG(ERROR) << " - total trigram count: " << size;
  219. LOG(ERROR) << " - max file count per trigram: " << maxSize;
  220. LOG(ERROR) << " - total vectors capacity " << capacity;
  221. size_t total_index_size =
  222. capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
  223. LOG(ERROR) << " - estimated total index size " << total_index_size;
  224. }
  225. typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
  226. } // namespace
  227. DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
  228. const FilePath& file_system_path,
  229. const TotalWorkCallback& total_work_callback,
  230. const WorkedCallback& worked_callback,
  231. const DoneCallback& done_callback)
  232. : file_system_path_(file_system_path),
  233. total_work_callback_(total_work_callback),
  234. worked_callback_(worked_callback),
  235. done_callback_(done_callback),
  236. current_file_(
  237. BrowserThread::GetTaskRunnerForThread(BrowserThread::FILE).get()),
  238. files_indexed_(0),
  239. stopped_(false) {
  240. current_trigrams_set_.resize(kTrigramCount);
  241. current_trigrams_.reserve(kTrigramCount);
  242. }
  243. DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
  244. void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
  245. DCHECK_CURRENTLY_ON(BrowserThread::UI);
  246. BrowserThread::PostTask(
  247. BrowserThread::FILE, FROM_HERE,
  248. BindOnce(&FileSystemIndexingJob::CollectFilesToIndex, this));
  249. }
  250. void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
  251. DCHECK_CURRENTLY_ON(BrowserThread::UI);
  252. BrowserThread::PostTask(
  253. BrowserThread::FILE, FROM_HERE,
  254. BindOnce(&FileSystemIndexingJob::StopOnFileThread, this));
  255. }
  256. void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
  257. stopped_ = true;
  258. }
  259. void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
  260. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  261. if (stopped_)
  262. return;
  263. if (!file_enumerator_) {
  264. file_enumerator_.reset(
  265. new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
  266. }
  267. FilePath file_path = file_enumerator_->Next();
  268. if (file_path.empty()) {
  269. BrowserThread::PostTask(
  270. BrowserThread::UI, FROM_HERE,
  271. BindOnce(total_work_callback_, file_path_times_.size()));
  272. indexing_it_ = file_path_times_.begin();
  273. IndexFiles();
  274. return;
  275. }
  276. Time saved_last_modified_time =
  277. g_trigram_index.Get().LastModifiedTimeForFile(file_path);
  278. FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
  279. Time current_last_modified_time = file_info.GetLastModifiedTime();
  280. if (current_last_modified_time > saved_last_modified_time) {
  281. file_path_times_[file_path] = current_last_modified_time;
  282. }
  283. BrowserThread::PostTask(
  284. BrowserThread::FILE, FROM_HERE,
  285. BindOnce(&FileSystemIndexingJob::CollectFilesToIndex, this));
  286. }
  287. void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
  288. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  289. if (stopped_)
  290. return;
  291. if (indexing_it_ == file_path_times_.end()) {
  292. g_trigram_index.Get().NormalizeVectors();
  293. BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
  294. return;
  295. }
  296. FilePath file_path = indexing_it_->first;
  297. current_file_.CreateOrOpen(
  298. file_path, base::File::FLAG_OPEN | base::File::FLAG_READ,
  299. Bind(&FileSystemIndexingJob::StartFileIndexing, this));
  300. }
  301. void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
  302. base::File::Error error) {
  303. if (!current_file_.IsValid()) {
  304. FinishFileIndexing(false);
  305. return;
  306. }
  307. current_file_offset_ = 0;
  308. current_trigrams_.clear();
  309. std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
  310. ReadFromFile();
  311. }
  312. void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
  313. if (stopped_) {
  314. CloseFile();
  315. return;
  316. }
  317. current_file_.Read(current_file_offset_, kMaxReadLength,
  318. Bind(&FileSystemIndexingJob::OnRead, this));
  319. }
  320. void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
  321. base::File::Error error,
  322. const char* data,
  323. int bytes_read) {
  324. if (error != base::File::FILE_OK) {
  325. FinishFileIndexing(false);
  326. return;
  327. }
  328. if (!bytes_read || bytes_read < 3) {
  329. FinishFileIndexing(true);
  330. return;
  331. }
  332. size_t size = static_cast<size_t>(bytes_read);
  333. vector<TrigramChar> trigram_chars;
  334. trigram_chars.reserve(size);
  335. for (size_t i = 0; i < size; ++i) {
  336. TrigramChar trigram_char = TrigramCharForChar(data[i]);
  337. if (trigram_char == kBinaryTrigramChar) {
  338. current_trigrams_.clear();
  339. FinishFileIndexing(true);
  340. return;
  341. }
  342. trigram_chars.push_back(trigram_char);
  343. }
  344. for (size_t i = 0; i + 2 < size; ++i) {
  345. Trigram trigram = TrigramAtIndex(trigram_chars, i);
  346. if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
  347. current_trigrams_set_[trigram] = true;
  348. current_trigrams_.push_back(trigram);
  349. }
  350. }
  351. current_file_offset_ += bytes_read - 2;
  352. ReadFromFile();
  353. }
  354. void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
  355. bool success) {
  356. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  357. CloseFile();
  358. if (success) {
  359. FilePath file_path = indexing_it_->first;
  360. g_trigram_index.Get().SetTrigramsForFile(file_path, current_trigrams_,
  361. file_path_times_[file_path]);
  362. }
  363. ReportWorked();
  364. ++indexing_it_;
  365. IndexFiles();
  366. }
  367. void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
  368. if (current_file_.IsValid())
  369. current_file_.Close(Bind(&FileSystemIndexingJob::CloseCallback, this));
  370. }
  371. void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
  372. base::File::Error error) {}
  373. void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
  374. TimeTicks current_time = TimeTicks::Now();
  375. bool should_send_worked_nitification = true;
  376. if (!last_worked_notification_time_.is_null()) {
  377. TimeDelta delta = current_time - last_worked_notification_time_;
  378. if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNotification)
  379. should_send_worked_nitification = false;
  380. }
  381. ++files_indexed_;
  382. if (should_send_worked_nitification) {
  383. last_worked_notification_time_ = current_time;
  384. BrowserThread::PostTask(BrowserThread::UI, FROM_HERE,
  385. BindOnce(worked_callback_, files_indexed_));
  386. files_indexed_ = 0;
  387. }
  388. }
  389. DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {}
  390. DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
  391. scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
  392. DevToolsFileSystemIndexer::IndexPath(
  393. const string& file_system_path,
  394. const TotalWorkCallback& total_work_callback,
  395. const WorkedCallback& worked_callback,
  396. const DoneCallback& done_callback) {
  397. DCHECK_CURRENTLY_ON(BrowserThread::UI);
  398. scoped_refptr<FileSystemIndexingJob> indexing_job = new FileSystemIndexingJob(
  399. FilePath::FromUTF8Unsafe(file_system_path), total_work_callback,
  400. worked_callback, done_callback);
  401. indexing_job->Start();
  402. return indexing_job;
  403. }
  404. void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
  405. const string& query,
  406. const SearchCallback& callback) {
  407. DCHECK_CURRENTLY_ON(BrowserThread::UI);
  408. BrowserThread::PostTask(
  409. BrowserThread::FILE, FROM_HERE,
  410. BindOnce(&DevToolsFileSystemIndexer::SearchInPathOnFileThread, this,
  411. file_system_path, query, callback));
  412. }
  413. void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
  414. const string& file_system_path,
  415. const string& query,
  416. const SearchCallback& callback) {
  417. DCHECK_CURRENTLY_ON(BrowserThread::FILE);
  418. vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
  419. vector<string> result;
  420. FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
  421. vector<FilePath>::const_iterator it = file_paths.begin();
  422. for (; it != file_paths.end(); ++it) {
  423. if (path.IsParent(*it))
  424. result.push_back(it->AsUTF8Unsafe());
  425. }
  426. BrowserThread::PostTask(BrowserThread::UI, FROM_HERE,
  427. BindOnce(callback, result));
  428. }
  429. } // namespace brightray