From 35ca81cc77f77a0096c204d3a6ed266d47d990af Mon Sep 17 00:00:00 2001 From: Antonio Juarez Date: Thu, 31 Aug 2017 12:03:00 +0100 Subject: [PATCH 1/5] Bytecoin v.2.1.1 release --- ReleaseNotes.txt | 5 + external/rocksdb/.gitignore | 5 + external/rocksdb/.travis.yml | 42 +- external/rocksdb/CMakeLists.txt | 102 +- external/rocksdb/DEFAULT_OPTIONS_HISTORY.md | 14 + external/rocksdb/HISTORY.md | 125 +- external/rocksdb/INSTALL.md | 16 +- external/rocksdb/LANGUAGE-BINDINGS.md | 12 + external/rocksdb/LICENSE | 2 +- external/rocksdb/Makefile | 544 +- external/rocksdb/README.md | 2 + external/rocksdb/USERS.md | 38 +- external/rocksdb/appveyor.yml | 11 +- external/rocksdb/appveyordailytests.yml | 22 - .../arcanist_util/__phutil_library_map__.php | 93 +- .../config/FacebookArcanistConfiguration.php | 41 +- .../FacebookOldArcanistConfiguration.php | 36 + .../config/RocksDBCommonHelper.php | 328 + .../BaseDirectoryScopedFormatLinter.php | 2 +- .../cpp_linter/FbcodeCppLinter.php | 3 + .../arcanist_util/cpp_linter/cpplint.py | 4 +- .../lint_engine/FacebookFbcodeLintEngine.php | 12 +- .../FacebookFbcodeUnitTestEngine.php | 16 +- .../rocksdb/build_tools/build_detect_platform | 97 +- external/rocksdb/build_tools/fbcode_config.sh | 83 +- .../rocksdb/build_tools/fbcode_config4.8.1.sh | 79 +- .../rocksdb/build_tools/make_new_version.sh | 4 +- .../build_tools/regression_build_test.sh | 6 +- .../build_tools/rocksdb-lego-determinator | 367 +- .../rocksdb/build_tools/run_ci_db_test.ps1 | 240 +- .../rocksdb/{util => db}/auto_roll_logger.cc | 51 +- .../rocksdb/{util => db}/auto_roll_logger.h | 37 +- .../{util => db}/auto_roll_logger_test.cc | 120 +- external/rocksdb/db/builder.cc | 70 +- external/rocksdb/db/builder.h | 37 +- external/rocksdb/db/c.cc | 158 +- external/rocksdb/db/c_test.c | 44 +- external/rocksdb/db/column_family.cc | 263 +- external/rocksdb/db/column_family.h | 97 +- external/rocksdb/db/column_family_test.cc | 2192 ++- external/rocksdb/db/compact_files_test.cc | 120 +- 
external/rocksdb/db/compacted_db_impl.cc | 4 +- external/rocksdb/db/compacted_db_impl.h | 2 +- external/rocksdb/db/compaction.cc | 77 +- external/rocksdb/db/compaction.h | 67 +- external/rocksdb/db/compaction_iterator.cc | 168 +- external/rocksdb/db/compaction_iterator.h | 29 +- .../rocksdb/db/compaction_iterator_test.cc | 8 +- external/rocksdb/db/compaction_job.cc | 325 +- external/rocksdb/db/compaction_job.h | 23 +- .../rocksdb/db/compaction_job_stats_test.cc | 16 +- external/rocksdb/db/compaction_job_test.cc | 332 +- external/rocksdb/db/compaction_picker.cc | 342 +- external/rocksdb/db/compaction_picker.h | 88 +- external/rocksdb/db/compaction_picker_test.cc | 288 +- external/rocksdb/db/comparator_db_test.cc | 5 +- external/rocksdb/db/convenience.cc | 9 +- external/rocksdb/db/corruption_test.cc | 29 +- external/rocksdb/db/cuckoo_table_db_test.cc | 2 +- external/rocksdb/db/db_block_cache_test.cc | 496 + external/rocksdb/db/db_bloom_filter_test.cc | 1028 ++ .../rocksdb/db/db_compaction_filter_test.cc | 181 +- external/rocksdb/db/db_compaction_test.cc | 855 +- external/rocksdb/db/db_dynamic_level_test.cc | 80 +- external/rocksdb/db/db_filesnapshot.cc | 13 +- external/rocksdb/db/db_flush_test.cc | 56 + external/rocksdb/db/db_impl.cc | 2696 +++- external/rocksdb/db/db_impl.h | 376 +- external/rocksdb/db/db_impl_add_file.cc | 419 + external/rocksdb/db/db_impl_debug.cc | 72 +- external/rocksdb/db/db_impl_experimental.cc | 5 +- external/rocksdb/db/db_impl_readonly.cc | 18 +- external/rocksdb/db/db_impl_readonly.h | 2 +- .../rocksdb/{util => db}/db_info_dumper.cc | 5 +- .../rocksdb/{util => db}/db_info_dumper.h | 2 +- external/rocksdb/db/db_inplace_update_test.cc | 22 +- external/rocksdb/db/db_io_failure_test.cc | 259 + external/rocksdb/db/db_iter.cc | 403 +- external/rocksdb/db/db_iter.h | 28 +- external/rocksdb/db/db_iter_test.cc | 201 +- external/rocksdb/db/db_iterator_test.cc | 1606 ++ external/rocksdb/db/db_log_iter_test.cc | 12 +- 
external/rocksdb/db/db_options_test.cc | 131 + external/rocksdb/db/db_properties_test.cc | 1278 ++ external/rocksdb/db/db_sst_test.cc | 1733 ++ .../rocksdb/db/db_table_properties_test.cc | 260 + external/rocksdb/db/db_tailing_iter_test.cc | 69 +- external/rocksdb/db/db_test.cc | 13005 ++++++---------- external/rocksdb/db/db_test2.cc | 1841 +++ external/rocksdb/{util => db}/db_test_util.cc | 414 +- external/rocksdb/{util => db}/db_test_util.h | 279 +- .../db/db_universal_compaction_test.cc | 206 +- external/rocksdb/db/db_wal_test.cc | 890 +- external/rocksdb/db/dbformat.cc | 2 +- external/rocksdb/db/dbformat.h | 83 +- external/rocksdb/db/dbformat_test.cc | 26 +- external/rocksdb/db/deletefile_test.cc | 140 +- external/rocksdb/db/event_helpers.cc | 115 +- external/rocksdb/db/event_helpers.h | 21 +- external/rocksdb/db/experimental.cc | 2 +- external/rocksdb/db/fault_injection_test.cc | 423 +- external/rocksdb/db/file_indexer.cc | 2 +- external/rocksdb/db/file_indexer.h | 8 +- external/rocksdb/db/file_indexer_test.cc | 2 +- external/rocksdb/db/filename.cc | 46 +- external/rocksdb/db/filename.h | 18 +- external/rocksdb/db/filename_test.cc | 2 +- external/rocksdb/db/flush_job.cc | 176 +- external/rocksdb/db/flush_job.h | 35 +- external/rocksdb/db/flush_job_test.cc | 42 +- external/rocksdb/db/flush_scheduler.cc | 83 +- external/rocksdb/db/flush_scheduler.h | 32 +- external/rocksdb/db/forward_iterator.cc | 166 +- external/rocksdb/db/forward_iterator.h | 26 +- external/rocksdb/db/forward_iterator_bench.cc | 374 + external/rocksdb/db/inlineskiplist.h | 657 + external/rocksdb/db/inlineskiplist_test.cc | 475 + external/rocksdb/db/internal_stats.cc | 861 +- external/rocksdb/db/internal_stats.h | 239 +- external/rocksdb/db/job_context.h | 15 +- external/rocksdb/db/listener_test.cc | 517 +- external/rocksdb/db/log_format.h | 16 +- external/rocksdb/db/log_reader.cc | 188 +- external/rocksdb/db/log_reader.h | 36 +- external/rocksdb/db/log_test.cc | 274 +- 
external/rocksdb/db/log_writer.cc | 72 +- external/rocksdb/db/log_writer.h | 22 +- external/rocksdb/db/managed_iterator.cc | 11 +- external/rocksdb/db/managed_iterator.h | 2 +- .../{util => db}/manual_compaction_test.cc | 2 +- external/rocksdb/db/memtable.cc | 260 +- external/rocksdb/db/memtable.h | 121 +- external/rocksdb/db/memtable_allocator.cc | 55 +- external/rocksdb/db/memtable_allocator.h | 19 +- external/rocksdb/db/memtable_list.cc | 134 +- external/rocksdb/db/memtable_list.h | 7 +- external/rocksdb/db/memtable_list_test.cc | 24 +- external/rocksdb/db/memtablerep_bench.cc | 10 +- external/rocksdb/db/merge_context.h | 104 +- external/rocksdb/db/merge_helper.cc | 99 +- external/rocksdb/db/merge_helper.h | 36 +- external/rocksdb/db/merge_helper_test.cc | 2 +- external/rocksdb/db/merge_operator.cc | 35 +- external/rocksdb/db/merge_test.cc | 65 +- external/rocksdb/db/options_file_test.cc | 119 + external/rocksdb/db/perf_context_test.cc | 125 +- .../rocksdb/db/pinned_iterators_manager.h | 66 + external/rocksdb/db/plain_table_db_test.cc | 101 +- external/rocksdb/db/prefix_test.cc | 153 +- external/rocksdb/db/repair.cc | 400 +- external/rocksdb/db/repair_test.cc | 276 + external/rocksdb/db/skiplist.h | 24 +- external/rocksdb/db/skiplist_test.cc | 2 +- external/rocksdb/db/slice.cc | 24 - external/rocksdb/db/snapshot_impl.cc | 5 +- external/rocksdb/db/snapshot_impl.h | 25 +- external/rocksdb/db/table_cache.cc | 112 +- external/rocksdb/db/table_cache.h | 39 +- .../rocksdb/db/table_properties_collector.cc | 55 +- .../rocksdb/db/table_properties_collector.h | 17 +- .../db/table_properties_collector_test.cc | 24 +- external/rocksdb/db/transaction_log_impl.cc | 15 +- external/rocksdb/db/transaction_log_impl.h | 2 +- external/rocksdb/db/version_builder.cc | 37 +- external/rocksdb/db/version_builder.h | 7 +- external/rocksdb/db/version_builder_test.cc | 2 +- external/rocksdb/db/version_edit.cc | 155 +- external/rocksdb/db/version_edit.h | 9 +- 
external/rocksdb/db/version_edit_test.cc | 118 +- external/rocksdb/db/version_set.cc | 612 +- external/rocksdb/db/version_set.h | 109 +- external/rocksdb/db/version_set_test.cc | 4 +- external/rocksdb/db/wal_manager.cc | 17 +- external/rocksdb/db/wal_manager.h | 9 +- external/rocksdb/db/wal_manager_test.cc | 24 +- external/rocksdb/db/write_batch.cc | 710 +- external/rocksdb/db/write_batch_base.cc | 2 +- external/rocksdb/db/write_batch_internal.h | 73 +- external/rocksdb/db/write_batch_test.cc | 179 +- external/rocksdb/db/write_callback.h | 5 +- external/rocksdb/db/write_callback_test.cc | 228 +- external/rocksdb/db/write_controller.cc | 28 +- external/rocksdb/db/write_controller.h | 38 +- external/rocksdb/db/write_controller_test.cc | 35 +- external/rocksdb/db/write_thread.cc | 336 +- external/rocksdb/db/write_thread.h | 231 +- external/rocksdb/db/writebuffer.h | 44 - external/rocksdb/db/xfunc_test_points.cc | 145 + external/rocksdb/db/xfunc_test_points.h | 33 + external/rocksdb/doc/log_format.txt | 2 +- external/rocksdb/examples/Makefile | 13 +- external/rocksdb/examples/README.md | 3 +- .../examples/column_families_example.cc | 2 +- .../rocksdb/examples/compact_files_example.cc | 2 +- .../examples/compaction_filter_example.cc | 25 +- .../optimistic_transaction_example.cc | 2 +- .../rocksdb/examples/options_file_example.cc | 113 + .../examples/rocksdb_option_file_example.ini | 141 +- external/rocksdb/examples/simple_example.cc | 2 +- .../rocksdb/examples/transaction_example.cc | 2 +- external/rocksdb/hdfs/README | 6 +- external/rocksdb/hdfs/env_hdfs.h | 13 +- external/rocksdb/include/rocksdb/c.h | 44 +- external/rocksdb/include/rocksdb/cache.h | 68 +- .../include/rocksdb/compaction_filter.h | 34 +- .../include/rocksdb/compaction_job_stats.h | 4 +- external/rocksdb/include/rocksdb/comparator.h | 2 +- .../rocksdb/include/rocksdb/convenience.h | 41 +- external/rocksdb/include/rocksdb/db.h | 399 +- .../rocksdb/include/rocksdb/db_bench_tool.h | 9 + 
.../rocksdb/include/rocksdb/db_dump_tool.h | 2 +- .../include/rocksdb/delete_scheduler.h | 66 - external/rocksdb/include/rocksdb/env.h | 95 +- .../rocksdb/include/rocksdb/experimental.h | 2 +- .../rocksdb/include/rocksdb/filter_policy.h | 2 +- .../include/rocksdb/flush_block_policy.h | 2 +- .../include/rocksdb/immutable_options.h | 10 +- .../rocksdb/include/rocksdb/iostats_context.h | 4 +- external/rocksdb/include/rocksdb/iterator.h | 57 +- external/rocksdb/include/rocksdb/ldb_tool.h | 10 +- external/rocksdb/include/rocksdb/listener.h | 123 +- .../rocksdb/include/rocksdb/memtablerep.h | 42 +- .../rocksdb/include/rocksdb/merge_operator.h | 58 +- external/rocksdb/include/rocksdb/metadata.h | 2 +- external/rocksdb/include/rocksdb/options.h | 413 +- .../rocksdb/include/rocksdb/perf_context.h | 64 +- external/rocksdb/include/rocksdb/perf_level.h | 14 +- .../include/rocksdb/persistent_cache.h | 49 + .../rocksdb/include/rocksdb/rate_limiter.h | 2 +- external/rocksdb/include/rocksdb/slice.h | 20 +- .../rocksdb/include/rocksdb/slice_transform.h | 2 +- external/rocksdb/include/rocksdb/snapshot.h | 5 +- .../rocksdb/include/rocksdb/sst_dump_tool.h | 2 +- .../include/rocksdb/sst_file_manager.h | 80 + .../rocksdb/include/rocksdb/sst_file_writer.h | 5 +- external/rocksdb/include/rocksdb/statistics.h | 84 +- external/rocksdb/include/rocksdb/status.h | 50 +- external/rocksdb/include/rocksdb/table.h | 73 +- .../include/rocksdb/table_properties.h | 128 +- .../rocksdb/include/rocksdb/thread_status.h | 2 +- .../rocksdb/include/rocksdb/transaction_log.h | 2 +- external/rocksdb/include/rocksdb/types.h | 2 +- .../include/rocksdb/universal_compaction.h | 2 +- .../include/rocksdb/utilities/backupable_db.h | 149 +- .../include/rocksdb/utilities/checkpoint.h | 2 +- .../include/rocksdb/utilities/convenience.h | 2 +- .../include/rocksdb/utilities/db_ttl.h | 2 +- .../include/rocksdb/utilities/document_db.h | 2 +- .../include/rocksdb/utilities/env_librados.h | 186 + 
.../include/rocksdb/utilities/env_mirror.h | 166 + .../include/rocksdb/utilities/env_registry.h | 45 + .../include/rocksdb/utilities/flashcache.h | 2 +- .../include/rocksdb/utilities/geo_db.h | 23 +- .../rocksdb/utilities/info_log_finder.h | 2 +- .../include/rocksdb/utilities/json_document.h | 2 +- .../include/rocksdb/utilities/ldb_cmd.h | 251 + .../utilities}/ldb_cmd_execute_result.h | 2 +- .../rocksdb/utilities/leveldb_options.h | 4 +- .../include/rocksdb/utilities/memory_util.h | 50 + .../utilities/optimistic_transaction_db.h | 18 +- .../utilities/option_change_migration.h | 19 + .../include/rocksdb/utilities/options_util.h | 86 + .../include/rocksdb/utilities/sim_cache.h | 66 + .../include/rocksdb/utilities/spatial_db.h | 2 +- .../include/rocksdb/utilities/stackable_db.h | 41 +- .../utilities/table_properties_collectors.h | 2 +- .../include/rocksdb/utilities/transaction.h | 140 +- .../rocksdb/utilities/transaction_db.h | 25 +- .../rocksdb/utilities/transaction_db_mutex.h | 4 +- .../utilities/write_batch_with_index.h | 5 +- external/rocksdb/include/rocksdb/version.h | 6 +- external/rocksdb/include/rocksdb/wal_filter.h | 101 + .../rocksdb/include/rocksdb/write_batch.h | 71 +- .../include/rocksdb/write_batch_base.h | 2 +- .../include/rocksdb/write_buffer_manager.h | 62 + external/rocksdb/java/CMakeLists.txt | 165 + external/rocksdb/java/Makefile | 9 +- external/rocksdb/java/RELEASE.md | 2 +- .../org/rocksdb/benchmark/DbBenchmark.java | 62 +- external/rocksdb/java/crossbuild/Vagrantfile | 2 +- .../java/crossbuild/build-linux-centos.sh | 8 +- external/rocksdb/java/rocksjni.pom | 2 +- .../rocksdb/java/rocksjni/backupablejni.cc | 131 +- .../rocksdb/java/rocksjni/backupenginejni.cc | 28 +- external/rocksdb/java/rocksjni/checkpoint.cc | 2 +- .../java/rocksjni/columnfamilyhandle.cc | 2 +- .../java/rocksjni/compaction_filter.cc | 3 +- external/rocksdb/java/rocksjni/comparator.cc | 14 +- .../java/rocksjni/comparatorjnicallback.cc | 8 +- 
.../java/rocksjni/comparatorjnicallback.h | 2 +- external/rocksdb/java/rocksjni/env.cc | 2 +- external/rocksdb/java/rocksjni/filter.cc | 14 +- external/rocksdb/java/rocksjni/iterator.cc | 2 +- .../java/rocksjni/loggerjnicallback.cc | 65 +- .../rocksdb/java/rocksjni/loggerjnicallback.h | 8 +- external/rocksdb/java/rocksjni/memtablejni.cc | 2 +- external/rocksdb/java/rocksjni/options.cc | 434 +- external/rocksdb/java/rocksjni/portal.h | 109 +- .../rocksdb/java/rocksjni/ratelimiterjni.cc | 2 +- .../remove_emptyvalue_compactionfilterjni.cc | 15 +- external/rocksdb/java/rocksjni/restorejni.cc | 172 +- external/rocksdb/java/rocksjni/rocksjni.cc | 622 +- external/rocksdb/java/rocksjni/slice.cc | 50 +- external/rocksdb/java/rocksjni/snapshot.cc | 2 +- external/rocksdb/java/rocksjni/statistics.cc | 10 +- external/rocksdb/java/rocksjni/table.cc | 7 +- .../rocksdb/java/rocksjni/transaction_log.cc | 2 +- external/rocksdb/java/rocksjni/ttl.cc | 175 +- external/rocksdb/java/rocksjni/write_batch.cc | 141 +- .../rocksdb/java/rocksjni/write_batch_test.cc | 41 +- .../java/rocksjni/write_batch_with_index.cc | 219 +- .../rocksjni/writebatchhandlerjnicallback.cc | 2 +- .../rocksjni/writebatchhandlerjnicallback.h | 2 +- .../main/java/RocksDBColumnFamilySample.java | 84 +- .../samples/src/main/java/RocksDBSample.java | 512 +- .../org/rocksdb/AbstractCompactionFilter.java | 21 +- .../java/org/rocksdb/AbstractComparator.java | 18 +- .../AbstractImmutableNativeReference.java | 66 + .../org/rocksdb/AbstractNativeReference.java | 76 + .../org/rocksdb/AbstractRocksIterator.java | 25 +- .../main/java/org/rocksdb/AbstractSlice.java | 58 +- .../java/org/rocksdb/AbstractWriteBatch.java | 97 +- .../main/java/org/rocksdb/BackupEngine.java | 65 +- .../src/main/java/org/rocksdb/BackupInfo.java | 2 +- .../main/java/org/rocksdb/BackupableDB.java | 37 +- .../java/org/rocksdb/BackupableDBOptions.java | 121 +- .../org/rocksdb/BlockBasedTableConfig.java | 35 +- .../main/java/org/rocksdb/BloomFilter.java | 
18 +- .../java/org/rocksdb/BuiltinComparator.java | 2 +- .../src/main/java/org/rocksdb/Checkpoint.java | 20 +- .../main/java/org/rocksdb/ChecksumType.java | 2 +- .../org/rocksdb/ColumnFamilyDescriptor.java | 2 +- .../java/org/rocksdb/ColumnFamilyHandle.java | 25 +- .../java/org/rocksdb/ColumnFamilyOptions.java | 138 +- .../rocksdb/ColumnFamilyOptionsInterface.java | 56 +- .../java/org/rocksdb/CompactionStyle.java | 2 +- .../src/main/java/org/rocksdb/Comparator.java | 14 +- .../java/org/rocksdb/ComparatorOptions.java | 16 +- .../java/org/rocksdb/CompressionType.java | 2 +- .../src/main/java/org/rocksdb/DBOptions.java | 155 +- .../java/org/rocksdb/DBOptionsInterface.java | 4 +- .../java/org/rocksdb/DirectComparator.java | 15 +- .../main/java/org/rocksdb/DirectSlice.java | 47 +- .../main/java/org/rocksdb/EncodingType.java | 2 +- .../java/src/main/java/org/rocksdb/Env.java | 6 +- .../src/main/java/org/rocksdb/Filter.java | 14 +- .../main/java/org/rocksdb/FlushOptions.java | 15 +- .../org/rocksdb/GenericRateLimiterConfig.java | 2 +- .../main/java/org/rocksdb/HistogramData.java | 2 +- .../main/java/org/rocksdb/HistogramType.java | 2 +- .../src/main/java/org/rocksdb/IndexType.java | 2 +- .../main/java/org/rocksdb/InfoLogLevel.java | 3 +- .../src/main/java/org/rocksdb/Logger.java | 19 +- .../main/java/org/rocksdb/MemTableConfig.java | 2 +- .../java/org/rocksdb/NativeLibraryLoader.java | 62 +- .../src/main/java/org/rocksdb/Options.java | 248 +- .../java/org/rocksdb/PlainTableConfig.java | 2 +- .../java/org/rocksdb/RateLimiterConfig.java | 2 +- .../main/java/org/rocksdb/ReadOptions.java | 201 +- .../src/main/java/org/rocksdb/ReadTier.java | 48 + .../RemoveEmptyValueCompactionFilter.java | 10 +- .../java/org/rocksdb/RestoreBackupableDB.java | 32 +- .../main/java/org/rocksdb/RestoreOptions.java | 29 +- .../src/main/java/org/rocksdb/RocksDB.java | 388 +- .../java/org/rocksdb/RocksDBException.java | 2 +- .../src/main/java/org/rocksdb/RocksEnv.java | 8 +- 
.../main/java/org/rocksdb/RocksIterator.java | 8 +- .../org/rocksdb/RocksIteratorInterface.java | 2 +- .../main/java/org/rocksdb/RocksMemEnv.java | 12 +- .../java/org/rocksdb/RocksMutableObject.java | 69 + .../main/java/org/rocksdb/RocksObject.java | 124 +- .../java/src/main/java/org/rocksdb/Slice.java | 24 +- .../src/main/java/org/rocksdb/Snapshot.java | 10 +- .../src/main/java/org/rocksdb/Statistics.java | 8 +- .../java/org/rocksdb/StatisticsCollector.java | 4 +- .../rocksdb/StatisticsCollectorCallback.java | 2 +- .../java/org/rocksdb/StatsCollectorInput.java | 2 +- .../java/org/rocksdb/TableFormatConfig.java | 2 +- .../src/main/java/org/rocksdb/TickerType.java | 2 +- .../org/rocksdb/TransactionLogIterator.java | 9 +- .../java/src/main/java/org/rocksdb/TtlDB.java | 70 +- .../java/org/rocksdb/WBWIRocksIterator.java | 51 +- .../src/main/java/org/rocksdb/WriteBatch.java | 64 +- .../java/org/rocksdb/WriteBatchInterface.java | 17 +- .../java/org/rocksdb/WriteBatchWithIndex.java | 113 +- .../main/java/org/rocksdb/WriteOptions.java | 13 +- .../org/rocksdb/util/BytewiseComparator.java | 91 + .../util/DirectBytewiseComparator.java | 88 + .../java/org/rocksdb/util/Environment.java | 15 +- .../util/ReverseBytewiseComparator.java | 37 + .../main/java/org/rocksdb/util/SizeUnit.java | 2 +- .../org/rocksdb/AbstractComparatorTest.java | 208 +- .../java/org/rocksdb/BackupEngineTest.java | 233 +- .../org/rocksdb/BackupableDBOptionsTest.java | 240 +- .../java/org/rocksdb/BackupableDBTest.java | 425 - .../rocksdb/BlockBasedTableConfigTest.java | 26 +- .../test/java/org/rocksdb/CheckPointTest.java | 91 +- .../org/rocksdb/ColumnFamilyOptionsTest.java | 477 +- .../java/org/rocksdb/ColumnFamilyTest.java | 1061 +- .../org/rocksdb/ComparatorOptionsTest.java | 11 +- .../test/java/org/rocksdb/ComparatorTest.java | 221 +- .../org/rocksdb/CompressionOptionsTest.java | 7 +- .../test/java/org/rocksdb/DBOptionsTest.java | 414 +- .../org/rocksdb/DirectComparatorTest.java | 2 +- 
.../java/org/rocksdb/DirectSliceTest.java | 72 +- .../src/test/java/org/rocksdb/FilterTest.java | 44 +- .../src/test/java/org/rocksdb/FlushTest.java | 58 +- .../java/org/rocksdb/InfoLogLevelTest.java | 67 +- .../java/org/rocksdb/KeyMayExistTest.java | 112 +- .../src/test/java/org/rocksdb/LoggerTest.java | 312 +- .../test/java/org/rocksdb/MemTableTest.java | 36 +- .../src/test/java/org/rocksdb/MergeTest.java | 361 +- .../java/org/rocksdb/MixedOptionsTest.java | 59 +- .../org/rocksdb/NativeLibraryLoaderTest.java | 16 +- .../test/java/org/rocksdb/OptionsTest.java | 761 +- .../org/rocksdb/PlainTableConfigTest.java | 12 +- .../org/rocksdb/PlatformRandomHelper.java | 4 +- .../test/java/org/rocksdb/ReadOnlyTest.java | 494 +- .../java/org/rocksdb/ReadOptionsTest.java | 170 +- .../test/java/org/rocksdb/RocksDBTest.java | 864 +- .../test/java/org/rocksdb/RocksEnvTest.java | 37 +- .../java/org/rocksdb/RocksIteratorTest.java | 70 +- .../java/org/rocksdb/RocksMemEnvTest.java | 164 +- .../java/org/rocksdb/RocksMemoryResource.java | 4 + .../src/test/java/org/rocksdb/SliceTest.java | 70 +- .../test/java/org/rocksdb/SnapshotTest.java | 290 +- .../org/rocksdb/StatisticsCollectorTest.java | 34 +- .../java/org/rocksdb/StatsCallbackMock.java | 2 +- .../rocksdb/TransactionLogIteratorTest.java | 210 +- .../src/test/java/org/rocksdb/TtlDBTest.java | 162 +- .../java/src/test/java/org/rocksdb/Types.java | 2 +- .../org/rocksdb/WriteBatchHandlerTest.java | 85 +- .../test/java/org/rocksdb/WriteBatchTest.java | 270 +- .../org/rocksdb/WriteBatchThreadedTest.java | 104 + .../org/rocksdb/WriteBatchWithIndexTest.java | 335 +- .../java/org/rocksdb/WriteOptionsTest.java | 23 +- .../org/rocksdb/test/RocksJunitRunner.java | 2 +- .../rocksdb/util/BytewiseComparatorTest.java | 480 + .../org/rocksdb/util/EnvironmentTest.java | 20 +- .../java/org/rocksdb/util/SizeUnitTest.java | 2 +- .../{util => memtable}/hash_cuckoo_rep.cc | 19 +- .../{util => memtable}/hash_cuckoo_rep.h | 5 +- .../{util => 
memtable}/hash_linklist_rep.cc | 4 +- .../{util => memtable}/hash_linklist_rep.h | 4 +- .../{util => memtable}/hash_skiplist_rep.cc | 4 +- .../{util => memtable}/hash_skiplist_rep.h | 4 +- .../rocksdb/{util => memtable}/skiplistrep.cc | 28 +- .../rocksdb/{util => memtable}/stl_wrappers.h | 14 +- .../rocksdb/{util => memtable}/vectorrep.cc | 4 +- external/rocksdb/port/dirent.h | 2 +- external/rocksdb/port/likely.h | 2 +- external/rocksdb/port/port.h | 3 +- external/rocksdb/port/port_example.h | 2 +- external/rocksdb/port/port_posix.cc | 24 +- external/rocksdb/port/port_posix.h | 34 +- external/rocksdb/port/stack_trace.cc | 6 +- external/rocksdb/port/stack_trace.h | 2 +- external/rocksdb/port/sys_time.h | 2 +- external/rocksdb/port/util_logger.h | 2 +- external/rocksdb/port/win/env_default.cc | 42 + external/rocksdb/port/win/env_win.cc | 2529 +-- external/rocksdb/port/win/env_win.h | 276 + external/rocksdb/port/win/io_win.cc | 963 ++ external/rocksdb/port/win/io_win.h | 359 + external/rocksdb/port/win/port_win.cc | 66 +- external/rocksdb/port/win/port_win.h | 145 +- external/rocksdb/port/win/win_logger.cc | 30 +- external/rocksdb/port/win/win_logger.h | 9 +- external/rocksdb/port/win/xpress_win.cc | 267 + external/rocksdb/port/win/xpress_win.h | 26 + external/rocksdb/port/xpress.h | 17 + external/rocksdb/src.mk | 109 +- .../rocksdb/table/adaptive_table_factory.cc | 31 +- .../rocksdb/table/adaptive_table_factory.h | 11 +- external/rocksdb/table/block.cc | 108 +- external/rocksdb/table/block.h | 79 +- .../rocksdb/table/block_based_filter_block.cc | 55 +- .../rocksdb/table/block_based_filter_block.h | 10 +- .../table/block_based_filter_block_test.cc | 14 +- .../table/block_based_table_builder.cc | 214 +- .../rocksdb/table/block_based_table_builder.h | 18 +- .../table/block_based_table_factory.cc | 66 +- .../rocksdb/table/block_based_table_factory.h | 25 +- .../rocksdb/table/block_based_table_reader.cc | 883 +- .../rocksdb/table/block_based_table_reader.h | 87 +- 
external/rocksdb/table/block_builder.cc | 52 +- external/rocksdb/table/block_builder.h | 9 +- external/rocksdb/table/block_hash_index.cc | 157 - external/rocksdb/table/block_hash_index.h | 85 - .../rocksdb/table/block_hash_index_test.cc | 120 - external/rocksdb/table/block_prefix_index.cc | 3 +- external/rocksdb/table/block_prefix_index.h | 2 +- external/rocksdb/table/block_test.cc | 32 +- external/rocksdb/table/bloom_block.cc | 2 +- external/rocksdb/table/bloom_block.h | 2 +- .../rocksdb/table/cuckoo_table_builder.cc | 15 +- external/rocksdb/table/cuckoo_table_builder.h | 6 +- .../table/cuckoo_table_builder_test.cc | 68 +- .../rocksdb/table/cuckoo_table_factory.cc | 10 +- external/rocksdb/table/cuckoo_table_factory.h | 17 +- external/rocksdb/table/cuckoo_table_reader.cc | 29 +- external/rocksdb/table/cuckoo_table_reader.h | 8 +- .../rocksdb/table/cuckoo_table_reader_test.cc | 20 +- external/rocksdb/table/filter_block.h | 19 +- external/rocksdb/table/flush_block_policy.cc | 24 +- external/rocksdb/table/format.cc | 202 +- external/rocksdb/table/format.h | 34 +- external/rocksdb/table/full_filter_block.cc | 20 +- external/rocksdb/table/full_filter_block.h | 9 +- .../rocksdb/table/full_filter_block_test.cc | 10 +- external/rocksdb/table/get_context.cc | 87 +- external/rocksdb/table/get_context.h | 33 +- external/rocksdb/table/internal_iterator.h | 104 + external/rocksdb/table/iter_heap.h | 2 +- external/rocksdb/table/iterator.cc | 67 +- external/rocksdb/table/iterator_wrapper.h | 58 +- external/rocksdb/table/merger.cc | 54 +- external/rocksdb/table/merger.h | 16 +- external/rocksdb/table/merger_test.cc | 8 +- external/rocksdb/table/meta_blocks.cc | 81 +- external/rocksdb/table/meta_blocks.h | 20 +- external/rocksdb/table/mock_table.cc | 20 +- external/rocksdb/table/mock_table.h | 27 +- .../rocksdb/table/persistent_cache_helper.cc | 112 + .../rocksdb/table/persistent_cache_helper.h | 63 + external/rocksdb/table/plain_table_builder.cc | 13 +- 
external/rocksdb/table/plain_table_builder.h | 14 +- external/rocksdb/table/plain_table_factory.cc | 39 +- external/rocksdb/table/plain_table_factory.h | 36 +- external/rocksdb/table/plain_table_index.cc | 7 +- external/rocksdb/table/plain_table_index.h | 2 +- .../rocksdb/table/plain_table_key_coding.cc | 95 +- .../rocksdb/table/plain_table_key_coding.h | 110 +- external/rocksdb/table/plain_table_reader.cc | 40 +- external/rocksdb/table/plain_table_reader.h | 14 +- .../{util => table}/scoped_arena_iterator.h | 17 +- external/rocksdb/table/sst_file_writer.cc | 46 +- external/rocksdb/table/table_builder.h | 27 +- external/rocksdb/table/table_properties.cc | 82 +- .../rocksdb/table/table_properties_internal.h | 11 +- external/rocksdb/table/table_reader.h | 15 +- external/rocksdb/table/table_reader_bench.cc | 30 +- external/rocksdb/table/table_test.cc | 533 +- external/rocksdb/table/two_level_iterator.cc | 58 +- external/rocksdb/table/two_level_iterator.h | 11 +- .../rocksdb/third-party/fbson/FbsonDocument.h | 2 +- .../third-party/fbson/FbsonJsonParser.h | 8 +- .../rocksdb/third-party/fbson/FbsonStream.h | 2 +- .../rocksdb/third-party/fbson/FbsonUtil.h | 2 +- .../rocksdb/third-party/fbson/FbsonWriter.h | 2 +- .../gtest-1.7.0/fused-src/gtest/gtest-all.cc | 8 +- .../gtest-1.7.0/fused-src/gtest/gtest.h | 4 +- external/rocksdb/thirdparty.inc | 98 +- external/rocksdb/tools/auto_sanity_test.sh | 6 +- external/rocksdb/tools/benchmark.sh | 192 +- .../rocksdb/tools/check_format_compatible.sh | 45 +- external/rocksdb/tools/db_bench.cc | 23 + .../db_bench.cc => tools/db_bench_tool.cc} | 855 +- external/rocksdb/tools/db_bench_tool_test.cc | 323 + external/rocksdb/tools/db_crashtest.py | 468 +- external/rocksdb/tools/db_crashtest2.py | 231 - external/rocksdb/tools/db_repl_stress.cc | 2 +- external/rocksdb/tools/db_sanity_test.cc | 7 +- external/rocksdb/tools/db_stress.cc | 242 +- external/rocksdb/tools/dump/db_dump_tool.cc | 2 +- external/rocksdb/tools/dump/rocksdb_dump.cc | 2 +- 
external/rocksdb/tools/dump/rocksdb_undump.cc | 2 +- external/rocksdb/tools/ldb.cc | 2 +- external/rocksdb/{util => tools}/ldb_cmd.cc | 1547 +- external/rocksdb/tools/ldb_cmd_impl.h | 493 + .../rocksdb/{util => tools}/ldb_cmd_test.cc | 30 +- external/rocksdb/tools/ldb_test.py | 84 +- external/rocksdb/tools/ldb_tool.cc | 124 + external/rocksdb/tools/rdb/API.md | 2 +- external/rocksdb/tools/reduce_levels_test.cc | 11 +- external/rocksdb/tools/regression_test.sh | 380 + external/rocksdb/tools/run_flash_bench.sh | 180 +- external/rocksdb/tools/sst_dump.cc | 2 +- .../rocksdb/{util => tools}/sst_dump_test.cc | 7 +- .../rocksdb/{util => tools}/sst_dump_tool.cc | 159 +- .../{util => tools}/sst_dump_tool_imp.h | 37 +- external/rocksdb/tools/write_stress.cc | 307 + external/rocksdb/tools/write_stress_runner.py | 73 + external/rocksdb/util/aligned_buffer.h | 2 +- external/rocksdb/util/allocator.h | 2 +- external/rocksdb/util/arena.cc | 6 +- external/rocksdb/util/arena.h | 5 +- external/rocksdb/util/arena_test.cc | 4 +- external/rocksdb/util/autovector.h | 2 +- external/rocksdb/util/autovector_test.cc | 51 +- external/rocksdb/util/bloom.cc | 2 +- external/rocksdb/util/bloom_test.cc | 2 +- external/rocksdb/util/build_version.h | 2 +- external/rocksdb/util/cache_bench.cc | 8 +- external/rocksdb/util/cache_test.cc | 87 +- external/rocksdb/util/channel.h | 2 +- external/rocksdb/util/coding.cc | 2 +- external/rocksdb/util/coding.h | 69 +- external/rocksdb/util/coding_test.cc | 2 +- .../rocksdb/util/compaction_job_stats_impl.cc | 2 +- external/rocksdb/util/comparator.cc | 38 +- external/rocksdb/util/compression.h | 246 +- external/rocksdb/util/concurrent_arena.cc | 50 + external/rocksdb/util/concurrent_arena.h | 200 + external/rocksdb/util/crc32c.cc | 11 +- external/rocksdb/util/crc32c.h | 2 +- external/rocksdb/util/crc32c_test.cc | 2 +- ..._scheduler_impl.cc => delete_scheduler.cc} | 100 +- ...te_scheduler_impl.h => delete_scheduler.h} | 28 +- 
.../rocksdb/util/delete_scheduler_test.cc | 93 +- external/rocksdb/util/dynamic_bloom.cc | 19 +- external/rocksdb/util/dynamic_bloom.h | 53 +- external/rocksdb/util/dynamic_bloom_test.cc | 162 +- external/rocksdb/util/env.cc | 44 +- external/rocksdb/util/env_basic_test.cc | 353 + external/rocksdb/util/env_chroot.cc | 299 + external/rocksdb/util/env_chroot.h | 22 + external/rocksdb/util/env_hdfs.cc | 14 +- external/rocksdb/util/env_posix.cc | 1182 +- external/rocksdb/util/env_test.cc | 634 +- external/rocksdb/util/event_logger.cc | 2 +- external/rocksdb/util/event_logger.h | 2 +- external/rocksdb/util/event_logger_test.cc | 2 +- .../rocksdb/util/fault_injection_test_env.cc | 312 + .../rocksdb/util/fault_injection_test_env.h | 158 + external/rocksdb/util/file_reader_writer.cc | 73 +- external/rocksdb/util/file_reader_writer.h | 15 +- .../rocksdb/util/file_reader_writer_test.cc | 47 +- external/rocksdb/util/file_util.cc | 37 +- external/rocksdb/util/file_util.h | 9 +- external/rocksdb/util/filelock_test.cc | 2 +- external/rocksdb/util/filter_policy.cc | 2 +- external/rocksdb/util/hash.cc | 2 +- external/rocksdb/util/hash.h | 4 +- external/rocksdb/util/heap.h | 2 +- external/rocksdb/util/heap_test.cc | 2 +- external/rocksdb/util/histogram.cc | 224 +- external/rocksdb/util/histogram.h | 115 +- external/rocksdb/util/histogram_test.cc | 198 +- external/rocksdb/util/histogram_windowing.cc | 194 + external/rocksdb/util/histogram_windowing.h | 80 + external/rocksdb/util/instrumented_mutex.cc | 26 +- external/rocksdb/util/instrumented_mutex.h | 2 +- external/rocksdb/util/io_posix.cc | 840 + external/rocksdb/util/io_posix.h | 228 + external/rocksdb/util/iostats_context.cc | 24 +- external/rocksdb/util/iostats_context_imp.h | 2 +- external/rocksdb/util/iostats_context_test.cc | 29 + external/rocksdb/util/kv_map.h | 31 + external/rocksdb/util/ldb_cmd.h | 769 - external/rocksdb/util/ldb_tool.cc | 125 - external/rocksdb/util/log_buffer.cc | 2 +- external/rocksdb/util/log_buffer.h 
| 2 +- external/rocksdb/util/log_write_bench.cc | 2 +- external/rocksdb/util/logging.cc | 2 +- external/rocksdb/util/logging.h | 2 +- .../rocksdb/util/{cache.cc => lru_cache.cc} | 320 +- external/rocksdb/util/memenv.cc | 16 +- external/rocksdb/util/mock_env.cc | 7 +- external/rocksdb/util/mock_env.h | 2 +- external/rocksdb/util/mock_env_test.cc | 209 +- external/rocksdb/util/murmurhash.cc | 2 +- external/rocksdb/util/murmurhash.h | 2 +- external/rocksdb/util/mutable_cf_options.cc | 18 +- external/rocksdb/util/mutable_cf_options.h | 50 +- external/rocksdb/util/mutexlock.h | 55 +- external/rocksdb/util/options.cc | 229 +- external/rocksdb/util/options_builder.cc | 206 - external/rocksdb/util/options_helper.cc | 878 +- external/rocksdb/util/options_helper.h | 279 +- external/rocksdb/util/options_parser.cc | 275 +- external/rocksdb/util/options_parser.h | 60 +- external/rocksdb/util/options_sanity_check.cc | 38 + external/rocksdb/util/options_sanity_check.h | 49 + .../rocksdb/util/options_settable_test.cc | 455 + external/rocksdb/util/options_test.cc | 892 +- external/rocksdb/util/perf_context.cc | 66 +- external/rocksdb/util/perf_context_imp.h | 9 +- external/rocksdb/util/perf_level.cc | 7 +- external/rocksdb/util/perf_level_imp.h | 2 +- external/rocksdb/util/perf_step_timer.h | 14 +- external/rocksdb/util/posix_logger.h | 13 +- external/rocksdb/util/random.cc | 38 + external/rocksdb/util/random.h | 29 +- external/rocksdb/util/rate_limiter.cc | 30 +- external/rocksdb/util/rate_limiter.h | 9 +- external/rocksdb/util/rate_limiter_test.cc | 63 +- external/rocksdb/util/sharded_cache.cc | 117 + external/rocksdb/util/sharded_cache.h | 93 + external/rocksdb/util/slice.cc | 80 +- external/rocksdb/util/slice_transform_test.cc | 2 +- .../rocksdb/util/sst_file_manager_impl.cc | 157 + external/rocksdb/util/sst_file_manager_impl.h | 95 + external/rocksdb/util/statistics.cc | 2 +- external/rocksdb/util/statistics.h | 10 +- external/rocksdb/util/statistics_test.cc | 35 + 
external/rocksdb/util/status.cc | 2 +- external/rocksdb/util/status_message.cc | 2 +- external/rocksdb/util/stderr_logger.h | 31 + external/rocksdb/util/stop_watch.h | 2 +- external/rocksdb/util/string_util.cc | 2 +- external/rocksdb/util/string_util.h | 4 +- external/rocksdb/util/sync_point.cc | 79 +- external/rocksdb/util/sync_point.h | 39 +- external/rocksdb/util/testharness.cc | 2 +- external/rocksdb/util/testharness.h | 2 +- external/rocksdb/util/testutil.cc | 232 +- external/rocksdb/util/testutil.h | 334 +- external/rocksdb/util/thread_list_test.cc | 2 +- external/rocksdb/util/thread_local.cc | 74 +- external/rocksdb/util/thread_local.h | 51 +- external/rocksdb/util/thread_local_test.cc | 67 +- external/rocksdb/util/thread_operation.h | 2 +- external/rocksdb/util/thread_status_impl.cc | 2 +- .../rocksdb/util/thread_status_updater.cc | 12 +- external/rocksdb/util/thread_status_updater.h | 2 +- .../util/thread_status_updater_debug.cc | 2 +- external/rocksdb/util/thread_status_util.cc | 37 +- external/rocksdb/util/thread_status_util.h | 15 +- .../rocksdb/util/thread_status_util_debug.cc | 2 +- external/rocksdb/util/threadpool.cc | 377 + external/rocksdb/util/threadpool.h | 110 + .../rocksdb/util/transaction_test_util.cc | 237 + external/rocksdb/util/transaction_test_util.h | 112 + external/rocksdb/util/xfunc.cc | 141 +- external/rocksdb/util/xfunc.h | 17 +- external/rocksdb/util/xxhash.cc | 1 + .../utilities/backupable/backupable_db.cc | 740 +- .../backupable/backupable_db_test.cc | 701 +- .../utilities/checkpoint/checkpoint.cc | 36 +- .../utilities/checkpoint/checkpoint_test.cc | 54 +- .../remove_emptyvalue_compactionfilter.cc | 2 +- .../remove_emptyvalue_compactionfilter.h | 2 +- .../utilities/convenience/info_log_finder.cc | 2 +- .../rocksdb/utilities/document/document_db.cc | 2 +- .../utilities/document/document_db_test.cc | 2 +- .../utilities/document/json_document.cc | 10 +- .../document/json_document_builder.cc | 9 +- 
.../utilities/document/json_document_test.cc | 4 +- external/rocksdb/utilities/env_librados.cc | 1498 ++ external/rocksdb/utilities/env_librados.md | 122 + .../rocksdb/utilities/env_librados_test.cc | 1146 ++ external/rocksdb/utilities/env_mirror.cc | 264 + .../env_mirror_test.cc} | 137 +- external/rocksdb/utilities/env_registry.cc | 47 + .../rocksdb/utilities/env_registry_test.cc | 72 + .../utilities/flashcache/flashcache.cc | 10 +- .../rocksdb/utilities/flashcache/flashcache.h | 2 +- .../rocksdb/utilities/geodb/geodb_impl.cc | 75 +- external/rocksdb/utilities/geodb/geodb_impl.h | 7 +- .../rocksdb/utilities/geodb/geodb_test.cc | 20 +- .../leveldb_options/leveldb_options.cc | 2 +- .../rocksdb/utilities/memory/memory_test.cc | 274 + .../rocksdb/utilities/memory/memory_util.cc | 52 + external/rocksdb/utilities/merge_operators.h | 8 +- .../rocksdb/utilities/merge_operators/max.cc | 75 + .../rocksdb/utilities/merge_operators/put.cc | 25 +- .../string_append/stringappend.cc | 3 - .../string_append/stringappend.h | 4 +- .../string_append/stringappend2.cc | 40 +- .../string_append/stringappend2.h | 7 +- .../utilities/merge_operators/uint64add.cc | 2 +- .../option_change_migration.cc | 153 + .../option_change_migration_test.cc | 207 + .../rocksdb/utilities/options/options_util.cc | 99 + .../utilities/options/options_util_test.cc | 311 + .../persistent_cache/block_cache_tier_file.cc | 575 + .../persistent_cache/block_cache_tier_file.h | 288 + .../block_cache_tier_file_buffer.h | 117 + .../block_cache_tier_metadata.cc | 83 + .../block_cache_tier_metadata.h | 124 + .../utilities/persistent_cache/hash_table.h | 238 + .../persistent_cache/hash_table_bench.cc | 303 + .../persistent_cache/hash_table_evictable.h | 166 + .../persistent_cache/hash_table_test.cc | 158 + .../utilities/persistent_cache/lrulist.h | 174 + .../persistent_cache/persistent_cache_test.cc | 56 + .../persistent_cache/persistent_cache_test.h | 417 + .../persistent_cache/persistent_cache_tier.cc | 111 + 
.../persistent_cache/persistent_cache_tier.h | 149 + .../persistent_cache/persistent_cache_util.h | 67 + .../persistent_cache/volatile_tier_impl.cc | 155 + .../persistent_cache/volatile_tier_impl.h | 141 + .../utilities/redis/redis_list_iterator.h | 7 +- .../utilities/redis/redis_lists_test.cc | 41 +- .../utilities/simulator_cache/sim_cache.cc | 165 + .../simulator_cache/sim_cache_test.cc | 145 + .../rocksdb/utilities/spatialdb/spatial_db.cc | 4 +- .../utilities/spatialdb/spatial_db_test.cc | 2 +- external/rocksdb/utilities/spatialdb/utils.h | 8 +- .../compact_on_deletion_collector.cc | 11 +- .../compact_on_deletion_collector.h | 6 +- .../compact_on_deletion_collector_test.cc | 15 +- .../optimistic_transaction_db_impl.cc | 27 +- .../optimistic_transaction_db_impl.h | 13 +- .../optimistic_transaction_impl.cc | 49 +- .../optimistic_transaction_impl.h | 23 +- .../optimistic_transaction_test.cc | 228 +- .../transactions/transaction_base.cc | 385 +- .../utilities/transactions/transaction_base.h | 111 +- .../transactions/transaction_db_impl.cc | 175 +- .../transactions/transaction_db_impl.h | 44 +- .../transactions/transaction_db_mutex_impl.cc | 44 +- .../transactions/transaction_db_mutex_impl.h | 2 +- .../transactions/transaction_impl.cc | 412 +- .../utilities/transactions/transaction_impl.h | 31 +- .../transactions/transaction_lock_mgr.cc | 22 +- .../transactions/transaction_lock_mgr.h | 8 +- .../transactions/transaction_test.cc | 4244 +++-- .../transactions/transaction_util.cc | 81 +- .../utilities/transactions/transaction_util.h | 26 +- external/rocksdb/utilities/ttl/db_ttl_impl.h | 49 +- .../utilities/util_merge_operators_test.cc | 99 + .../write_batch_with_index.cc | 48 +- .../write_batch_with_index_internal.cc | 56 +- .../write_batch_with_index_internal.h | 32 +- .../write_batch_with_index_test.cc | 2 +- src/CryptoNoteConfig.h | 7 +- src/version.h.in | 4 +- 819 files changed, 77732 insertions(+), 34823 deletions(-) create mode 100644 
external/rocksdb/DEFAULT_OPTIONS_HISTORY.md create mode 100644 external/rocksdb/LANGUAGE-BINDINGS.md delete mode 100644 external/rocksdb/appveyordailytests.yml create mode 100644 external/rocksdb/arcanist_util/config/FacebookOldArcanistConfiguration.php create mode 100644 external/rocksdb/arcanist_util/config/RocksDBCommonHelper.php rename external/rocksdb/{util => db}/auto_roll_logger.cc (73%) rename external/rocksdb/{util => db}/auto_roll_logger.h (77%) rename external/rocksdb/{util => db}/auto_roll_logger_test.cc (75%) create mode 100644 external/rocksdb/db/db_block_cache_test.cc create mode 100644 external/rocksdb/db/db_bloom_filter_test.cc create mode 100644 external/rocksdb/db/db_flush_test.cc create mode 100644 external/rocksdb/db/db_impl_add_file.cc rename external/rocksdb/{util => db}/db_info_dumper.cc (97%) rename external/rocksdb/{util => db}/db_info_dumper.h (85%) create mode 100644 external/rocksdb/db/db_io_failure_test.cc create mode 100644 external/rocksdb/db/db_iterator_test.cc create mode 100644 external/rocksdb/db/db_options_test.cc create mode 100644 external/rocksdb/db/db_properties_test.cc create mode 100644 external/rocksdb/db/db_sst_test.cc create mode 100644 external/rocksdb/db/db_table_properties_test.cc create mode 100644 external/rocksdb/db/db_test2.cc rename external/rocksdb/{util => db}/db_test_util.cc (72%) rename external/rocksdb/{util => db}/db_test_util.h (70%) create mode 100644 external/rocksdb/db/forward_iterator_bench.cc create mode 100644 external/rocksdb/db/inlineskiplist.h create mode 100644 external/rocksdb/db/inlineskiplist_test.cc rename external/rocksdb/{util => db}/manual_compaction_test.cc (98%) create mode 100644 external/rocksdb/db/options_file_test.cc create mode 100644 external/rocksdb/db/pinned_iterators_manager.h create mode 100644 external/rocksdb/db/repair_test.cc delete mode 100644 external/rocksdb/db/slice.cc delete mode 100644 external/rocksdb/db/writebuffer.h create mode 100644 
external/rocksdb/db/xfunc_test_points.cc create mode 100644 external/rocksdb/db/xfunc_test_points.h create mode 100644 external/rocksdb/examples/options_file_example.cc create mode 100644 external/rocksdb/include/rocksdb/db_bench_tool.h delete mode 100644 external/rocksdb/include/rocksdb/delete_scheduler.h create mode 100644 external/rocksdb/include/rocksdb/persistent_cache.h create mode 100644 external/rocksdb/include/rocksdb/sst_file_manager.h create mode 100644 external/rocksdb/include/rocksdb/utilities/env_librados.h create mode 100644 external/rocksdb/include/rocksdb/utilities/env_mirror.h create mode 100644 external/rocksdb/include/rocksdb/utilities/env_registry.h create mode 100644 external/rocksdb/include/rocksdb/utilities/ldb_cmd.h rename external/rocksdb/{util => include/rocksdb/utilities}/ldb_cmd_execute_result.h (95%) create mode 100644 external/rocksdb/include/rocksdb/utilities/memory_util.h create mode 100644 external/rocksdb/include/rocksdb/utilities/option_change_migration.h create mode 100644 external/rocksdb/include/rocksdb/utilities/options_util.h create mode 100644 external/rocksdb/include/rocksdb/utilities/sim_cache.h create mode 100644 external/rocksdb/include/rocksdb/wal_filter.h create mode 100644 external/rocksdb/include/rocksdb/write_buffer_manager.h create mode 100644 external/rocksdb/java/CMakeLists.txt create mode 100644 external/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java create mode 100644 external/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java create mode 100644 external/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java create mode 100644 external/rocksdb/java/src/main/java/org/rocksdb/RocksMutableObject.java create mode 100644 external/rocksdb/java/src/main/java/org/rocksdb/util/BytewiseComparator.java create mode 100644 external/rocksdb/java/src/main/java/org/rocksdb/util/DirectBytewiseComparator.java create mode 100644 
external/rocksdb/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java delete mode 100644 external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java create mode 100644 external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java create mode 100644 external/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java rename external/rocksdb/{util => memtable}/hash_cuckoo_rep.cc (98%) rename external/rocksdb/{util => memtable}/hash_cuckoo_rep.h (94%) rename external/rocksdb/{util => memtable}/hash_linklist_rep.cc (99%) rename external/rocksdb/{util => memtable}/hash_linklist_rep.h (96%) rename external/rocksdb/{util => memtable}/hash_skiplist_rep.cc (99%) rename external/rocksdb/{util => memtable}/hash_skiplist_rep.h (95%) rename external/rocksdb/{util => memtable}/skiplistrep.cc (90%) rename external/rocksdb/{util => memtable}/stl_wrappers.h (68%) rename external/rocksdb/{util => memtable}/vectorrep.cc (98%) create mode 100644 external/rocksdb/port/win/env_default.cc create mode 100644 external/rocksdb/port/win/env_win.h create mode 100644 external/rocksdb/port/win/io_win.cc create mode 100644 external/rocksdb/port/win/io_win.h create mode 100644 external/rocksdb/port/win/xpress_win.cc create mode 100644 external/rocksdb/port/win/xpress_win.h create mode 100644 external/rocksdb/port/xpress.h delete mode 100644 external/rocksdb/table/block_hash_index.cc delete mode 100644 external/rocksdb/table/block_hash_index.h delete mode 100644 external/rocksdb/table/block_hash_index_test.cc create mode 100644 external/rocksdb/table/internal_iterator.h create mode 100644 external/rocksdb/table/persistent_cache_helper.cc create mode 100644 external/rocksdb/table/persistent_cache_helper.h rename external/rocksdb/{util => table}/scoped_arena_iterator.h (56%) create mode 100644 external/rocksdb/tools/db_bench.cc rename external/rocksdb/{db/db_bench.cc => tools/db_bench_tool.cc} (88%) create mode 100644 
external/rocksdb/tools/db_bench_tool_test.cc delete mode 100644 external/rocksdb/tools/db_crashtest2.py rename external/rocksdb/{util => tools}/ldb_cmd.cc (51%) create mode 100644 external/rocksdb/tools/ldb_cmd_impl.h rename external/rocksdb/{util => tools}/ldb_cmd_test.cc (55%) create mode 100644 external/rocksdb/tools/ldb_tool.cc create mode 100644 external/rocksdb/tools/regression_test.sh rename external/rocksdb/{util => tools}/sst_dump_test.cc (95%) rename external/rocksdb/{util => tools}/sst_dump_tool.cc (76%) rename external/rocksdb/{util => tools}/sst_dump_tool_imp.h (77%) create mode 100644 external/rocksdb/tools/write_stress.cc create mode 100644 external/rocksdb/tools/write_stress_runner.py create mode 100644 external/rocksdb/util/concurrent_arena.cc create mode 100644 external/rocksdb/util/concurrent_arena.h rename external/rocksdb/util/{delete_scheduler_impl.cc => delete_scheduler.cc} (62%) rename external/rocksdb/util/{delete_scheduler_impl.h => delete_scheduler.h} (71%) create mode 100644 external/rocksdb/util/env_basic_test.cc create mode 100644 external/rocksdb/util/env_chroot.cc create mode 100644 external/rocksdb/util/env_chroot.h create mode 100644 external/rocksdb/util/fault_injection_test_env.cc create mode 100644 external/rocksdb/util/fault_injection_test_env.h create mode 100644 external/rocksdb/util/histogram_windowing.cc create mode 100644 external/rocksdb/util/histogram_windowing.h create mode 100644 external/rocksdb/util/io_posix.cc create mode 100644 external/rocksdb/util/io_posix.h create mode 100644 external/rocksdb/util/iostats_context_test.cc create mode 100644 external/rocksdb/util/kv_map.h delete mode 100644 external/rocksdb/util/ldb_cmd.h delete mode 100644 external/rocksdb/util/ldb_tool.cc rename external/rocksdb/util/{cache.cc => lru_cache.cc} (63%) delete mode 100644 external/rocksdb/util/options_builder.cc create mode 100644 external/rocksdb/util/options_sanity_check.cc create mode 100644 
external/rocksdb/util/options_sanity_check.h create mode 100644 external/rocksdb/util/options_settable_test.cc create mode 100644 external/rocksdb/util/random.cc create mode 100644 external/rocksdb/util/sharded_cache.cc create mode 100644 external/rocksdb/util/sharded_cache.h create mode 100644 external/rocksdb/util/sst_file_manager_impl.cc create mode 100644 external/rocksdb/util/sst_file_manager_impl.h create mode 100644 external/rocksdb/util/statistics_test.cc create mode 100644 external/rocksdb/util/stderr_logger.h create mode 100644 external/rocksdb/util/threadpool.cc create mode 100644 external/rocksdb/util/threadpool.h create mode 100644 external/rocksdb/util/transaction_test_util.cc create mode 100644 external/rocksdb/util/transaction_test_util.h create mode 100644 external/rocksdb/utilities/env_librados.cc create mode 100644 external/rocksdb/utilities/env_librados.md create mode 100644 external/rocksdb/utilities/env_librados_test.cc create mode 100644 external/rocksdb/utilities/env_mirror.cc rename external/rocksdb/{util/memenv_test.cc => utilities/env_mirror_test.cc} (63%) create mode 100644 external/rocksdb/utilities/env_registry.cc create mode 100644 external/rocksdb/utilities/env_registry_test.cc create mode 100644 external/rocksdb/utilities/memory/memory_test.cc create mode 100644 external/rocksdb/utilities/memory/memory_util.cc create mode 100644 external/rocksdb/utilities/merge_operators/max.cc create mode 100644 external/rocksdb/utilities/option_change_migration/option_change_migration.cc create mode 100644 external/rocksdb/utilities/option_change_migration/option_change_migration_test.cc create mode 100644 external/rocksdb/utilities/options/options_util.cc create mode 100644 external/rocksdb/utilities/options/options_util_test.cc create mode 100644 external/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc create mode 100644 external/rocksdb/utilities/persistent_cache/block_cache_tier_file.h create mode 100644 
external/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h create mode 100644 external/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc create mode 100644 external/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h create mode 100644 external/rocksdb/utilities/persistent_cache/hash_table.h create mode 100644 external/rocksdb/utilities/persistent_cache/hash_table_bench.cc create mode 100644 external/rocksdb/utilities/persistent_cache/hash_table_evictable.h create mode 100644 external/rocksdb/utilities/persistent_cache/hash_table_test.cc create mode 100644 external/rocksdb/utilities/persistent_cache/lrulist.h create mode 100644 external/rocksdb/utilities/persistent_cache/persistent_cache_test.cc create mode 100644 external/rocksdb/utilities/persistent_cache/persistent_cache_test.h create mode 100644 external/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc create mode 100644 external/rocksdb/utilities/persistent_cache/persistent_cache_tier.h create mode 100644 external/rocksdb/utilities/persistent_cache/persistent_cache_util.h create mode 100644 external/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc create mode 100644 external/rocksdb/utilities/persistent_cache/volatile_tier_impl.h create mode 100644 external/rocksdb/utilities/simulator_cache/sim_cache.cc create mode 100644 external/rocksdb/utilities/simulator_cache/sim_cache_test.cc create mode 100644 external/rocksdb/utilities/util_merge_operators_test.cc diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index d80a5b29f7..ab1ccb002c 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -1,3 +1,8 @@ +Release notes 2.1.1 + +- Updated RocksDB to release 4.11.2 +- Added two more seed nodes + Release notes 2.1.0 - Transaction pool issue fixes diff --git a/external/rocksdb/.gitignore b/external/rocksdb/.gitignore index 6a92b5d537..0a297b4029 100644 --- a/external/rocksdb/.gitignore +++ b/external/rocksdb/.gitignore @@ -42,6 +42,7 @@ unity.a tags 
rocksdb_dump rocksdb_undump +db_test2 java/out java/target @@ -63,3 +64,7 @@ java/javadoc scan_build_report/ t LOG + +db_logs/ +tp2/ +fbcode/ diff --git a/external/rocksdb/.travis.yml b/external/rocksdb/.travis.yml index 804554ca50..85eb6bfa06 100644 --- a/external/rocksdb/.travis.yml +++ b/external/rocksdb/.travis.yml @@ -1,24 +1,29 @@ sudo: false language: cpp +os: + - linux + - osx +compiler: + - clang -matrix: - include: - - os: linux - compiler: clang - env: COMPILER=clang++-3.6 - addons: - apt: - sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.6'] - packages: ['clang-3.6', 'clang-format-3.6', 'zlib1g-dev', 'libbz2-dev', 'libsnappy-dev', 'curl'] - - os: osx - compiler: clang +addons: + apt: + sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.6'] + packages: ['clang-3.6' , 'zlib1g-dev', 'libbz2-dev', 'libsnappy-dev', 'curl'] +env: + # Run all tests before db_block_cache_test (db_test, db_test2) + - JOB_NAME=unittests ROCKSDBTESTS_END=db_block_cache_test + # Run all tests starting from db_block_cache_test (db_block_cache_test, db_iter_test, ...) + - JOB_NAME=unittests ROCKSDBTESTS_START=db_block_cache_test + # Run java tests + - JOB_NAME=java_test + # Build ROCKSDB_LITE + - JOB_NAME=lite_build install: # Build gflags # TODO(noetzli): Remove when gflags available through Travis - pushd /tmp/ && curl -L https://github.com/gflags/gflags/archive/v2.1.2.tar.gz -o gflags.tar.gz && tar xfz gflags.tar.gz && cd gflags-2.1.2 && cmake . 
&& make && popd - # Download clang-format-diff.py to check source code formatting - - pushd /tmp/ && curl -L http://llvm.org/svn/llvm-project/cfe/trunk/tools/clang-format/clang-format-diff.py -o clang-format-diff.py && chmod +x clang-format-diff.py && popd before_script: # Add gflags to include/library paths @@ -26,16 +31,15 @@ before_script: - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/tmp/gflags-2.1.2/lib" - export LIBRARY_PATH="$LIBRARY_PATH:/tmp/gflags-2.1.2/lib" - export CPLUS_INCLUDE_PATH="$CPLUS_INCLUDE_PATH:/tmp/gflags-2.1.2/include" - - if [ -n "${COMPILER}" ]; then CXX=${COMPILER}; fi + - if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then CXX=clang++-3.6; fi - if [[ "${TRAVIS_OS_NAME}" == 'osx' ]]; then brew install gflags snappy; fi + # Limit the maximum number of open file descriptors to 2000 - ulimit -n 2000 || true -# Lousy hack to disable use and testing of fallocate, which doesn't behave quite -# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. script: - - if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then OPT=-DTRAVIS CLANG_FORMAT_DIFF=/tmp/clang-format-diff.py make format || true; fi - - OPT=-DTRAVIS V=1 make -j4 check && OPT=-DTRAVIS V=1 make clean jclean rocksdbjava jtest - + - if [[ "${JOB_NAME}" == 'unittests' ]]; then OPT=-DTRAVIS V=1 make -j4 check_some; fi + - if [[ "${JOB_NAME}" == 'java_test' ]]; then OPT=-DTRAVIS V=1 make clean jclean rocksdbjava jtest; fi + - if [[ "${JOB_NAME}" == 'lite_build' ]]; then OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib; fi notifications: email: - leveldb@fb.com diff --git a/external/rocksdb/CMakeLists.txt b/external/rocksdb/CMakeLists.txt index 6a691c08b6..26e5a6f770 100644 --- a/external/rocksdb/CMakeLists.txt +++ b/external/rocksdb/CMakeLists.txt @@ -13,25 +13,41 @@ # cd build # 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries. # See thirdparty.inc for more information. 
-# sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 .. -# 4. Then build the project in debug mode (you may want to add /m: flag to run msbuild in parallel threads) -# msbuild ALL_BUILD.vcxproj +# sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 -DJNI=1 .. +# OR for VS Studio 15 cmake -G "Visual Studio 14 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 -DJNI=1 .. +# 4. Then build the project in debug mode (you may want to add /m[:] flag to run msbuild in parallel threads +# or simply /m ot use all avail cores) +# msbuild rocksdb.sln +# +# rocksdb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything +# will be attempted but test only code does not build in Release mode. +# # 5. And release mode (/m[:] is also supported) -# msbuild ALL_BUILD.vcxproj /p:Configuration=Release +# msbuild rocksdb.sln /p:Configuration=Release # cmake_minimum_required(VERSION 2.6) project(rocksdb) -execute_process(COMMAND $ENV{COMSPEC} " /C date /T" OUTPUT_VARIABLE DATE) -execute_process(COMMAND $ENV{COMSPEC} " /C time /T" OUTPUT_VARIABLE TIME) -string(REGEX REPLACE "(..)/(..)/..(..).*" "\\1/\\2/\\3" DATE ${DATE}) -string(REGEX REPLACE "(..):(.....).*" " \\1:\\2" TIME ${TIME}) -#string(CONCAT GIT_DATE_TIME ${DATE} ${TIME}) +execute_process(COMMAND powershell -Command "Get-Date -format MM_dd_yyyy" OUTPUT_VARIABLE DATE) +execute_process(COMMAND powershell -Command "Get-Date -format HH:mm:ss" OUTPUT_VARIABLE TIME) +string(REGEX REPLACE "(..)_(..)_..(..).*" "\\1/\\2/\\3" DATE "${DATE}") +string(REGEX REPLACE "(..):(.....).*" " \\1:\\2" TIME "${TIME}") set(GIT_DATE_TIME ${DATE}${TIME}) + string(REGEX REPLACE "\n" "" GIT_DATE_TIME ${GIT_DATE_TIME}) string(REGEX REPLACE "\r" "" GIT_DATE_TIME ${GIT_DATE_TIME}) -set(GIT_SHA "Unknown") + +find_package(Git) + +if (GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") + execute_process(COMMAND $ENV{COMSPEC} /C ${GIT_EXECUTABLE} -C 
${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) +else() + set(GIT_SHA 0) +endif() + +string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") + set(BUILD_VERSION_CC ${CMAKE_CURRENT_SOURCE_DIR}/util/build_version.cc) add_custom_command(OUTPUT ${BUILD_VERSION_CC} @@ -44,7 +60,9 @@ add_custom_command(OUTPUT ${BUILD_VERSION_CC} add_custom_target(GenerateBuildVersion DEPENDS ${BUILD_VERSION_CC}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /WX /wd4018 /wd4100 /wd4101 /wd4127 /wd4189 /wd4200 /wd4244 /wd4267 /wd4296 /wd4305 /wd4307 /wd4309 /wd4512 /wd4701 /wd4702 /wd4800 /wd4804 /wd4996") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /WX /wd4127 /wd4800 /wd4996 /wd4351 /wd4244") + # Used to run CI build and tests so we can run faster set(OPTIMIZE_DEBUG_DEFAULT 0) # Debug build is unoptimized by default use -DOPTDBG=1 to optimize @@ -74,16 +92,18 @@ if(STATIC) endforeach() endif() -add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64) +add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DNOMINMAX) include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) -#include_directories(${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) set(ROCKSDB_LIBS rocksdblib${ARTIFACT_SUFFIX}) + set(LIBS ${ROCKSDB_LIBS} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) +# Main library source code set(SOURCES + db/auto_roll_logger.cc db/builder.cc db/c.cc db/column_family.cc @@ -98,7 +118,9 @@ set(SOURCES db/db_impl.cc db/db_impl_debug.cc db/db_impl_experimental.cc + db/db_impl_add_file.cc db/db_impl_readonly.cc + db/db_info_dumper.cc db/db_iter.cc db/event_helpers.cc db/experimental.cc @@ -117,7 +139,6 @@ set(SOURCES db/merge_helper.cc db/merge_operator.cc db/repair.cc - db/slice.cc db/snapshot_impl.cc db/table_cache.cc db/table_properties_collector.cc @@ -130,10 +151,19 @@ set(SOURCES 
db/write_batch_base.cc db/write_controller.cc db/write_thread.cc + db/xfunc_test_points.cc + memtable/hash_cuckoo_rep.cc + memtable/hash_linklist_rep.cc + memtable/hash_skiplist_rep.cc + memtable/skiplistrep.cc + memtable/vectorrep.cc port/stack_trace.cc + port/win/io_win.cc port/win/env_win.cc + port/win/env_default.cc port/win/port_win.cc port/win/win_logger.cc + port/win/xpress_win.cc table/adaptive_table_factory.cc table/block.cc table/block_based_filter_block.cc @@ -141,7 +171,6 @@ set(SOURCES table/block_based_table_factory.cc table/block_based_table_reader.cc table/block_builder.cc - table/block_hash_index.cc table/block_prefix_index.cc table/bloom_block.cc table/cuckoo_table_builder.cc @@ -155,59 +184,58 @@ set(SOURCES table/merger.cc table/sst_file_writer.cc table/meta_blocks.cc - table/mock_table.cc table/plain_table_builder.cc table/plain_table_factory.cc table/plain_table_index.cc table/plain_table_key_coding.cc table/plain_table_reader.cc + table/persistent_cache_helper.cc table/table_properties.cc table/two_level_iterator.cc + tools/sst_dump_tool.cc + tools/db_bench_tool.cc tools/dump/db_dump_tool.cc util/arena.cc - util/auto_roll_logger.cc util/bloom.cc util/build_version.cc - util/cache.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc + util/concurrent_arena.cc util/crc32c.cc - util/db_info_dumper.cc - util/delete_scheduler_impl.cc - util/db_test_util.cc + util/delete_scheduler.cc util/dynamic_bloom.cc util/env.cc + util/env_chroot.cc util/env_hdfs.cc util/event_logger.cc util/file_util.cc util/file_reader_writer.cc + util/sst_file_manager_impl.cc util/filter_policy.cc util/hash.cc - util/hash_cuckoo_rep.cc - util/hash_linklist_rep.cc - util/hash_skiplist_rep.cc util/histogram.cc + util/histogram_windowing.cc util/instrumented_mutex.cc util/iostats_context.cc - util/ldb_cmd.cc - util/ldb_tool.cc + util/lru_cache.cc + tools/ldb_cmd.cc + tools/ldb_tool.cc util/logging.cc util/log_buffer.cc util/memenv.cc - util/mock_env.cc 
util/murmurhash.cc util/mutable_cf_options.cc util/options.cc - util/options_builder.cc util/options_helper.cc util/options_parser.cc + util/options_sanity_check.cc util/perf_context.cc util/perf_level.cc + util/random.cc util/rate_limiter.cc - util/skiplistrep.cc + util/sharded_cache.cc util/slice.cc - util/sst_dump_tool.cc util/statistics.cc util/status.cc util/status_message.cc @@ -216,27 +244,37 @@ set(SOURCES util/testharness.cc util/testutil.cc util/thread_local.cc + util/threadpool.cc util/thread_status_impl.cc util/thread_status_updater.cc - util/thread_status_updater_debug.cc util/thread_status_util.cc util/thread_status_util_debug.cc - util/vectorrep.cc + util/transaction_test_util.cc util/xfunc.cc util/xxhash.cc utilities/backupable/backupable_db.cc utilities/checkpoint/checkpoint.cc + utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc utilities/document/document_db.cc utilities/document/json_document.cc utilities/document/json_document_builder.cc + utilities/env_mirror.cc + utilities/env_registry.cc utilities/flashcache/flashcache.cc utilities/geodb/geodb_impl.cc utilities/leveldb_options/leveldb_options.cc + utilities/memory/memory_util.cc utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/put.cc + utilities/merge_operators/max.cc utilities/merge_operators/uint64add.cc + utilities/option_change_migration/option_change_migration.cc + utilities/options/options_util.cc + utilities/persistent_cache/persistent_cache_tier.cc + utilities/persistent_cache/volatile_tier_impl.cc utilities/redis/redis_lists.cc + utilities/simulator_cache/sim_cache.cc utilities/spatialdb/spatial_db.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/transactions/optimistic_transaction_impl.cc diff --git a/external/rocksdb/DEFAULT_OPTIONS_HISTORY.md b/external/rocksdb/DEFAULT_OPTIONS_HISTORY.md new file mode 100644 index 0000000000..e4ad3a7285 --- 
/dev/null +++ b/external/rocksdb/DEFAULT_OPTIONS_HISTORY.md @@ -0,0 +1,14 @@ +# RocksDB default options change log +## 4.8.0 (5/2/2016) +* options.max_open_files changes from 5000 to -1. It improves performance, but users need to set file descriptor limit to be large enough and watch memory usage for index and bloom filters. +* options.base_background_compactions changes from max_background_compactions to 1. When users set higher max_background_compactions but the write throughput is not high, the writes are less spiky to disks. +* options.wal_recovery_mode changes from kTolerateCorruptedTailRecords to kPointInTimeRecovery. Avoid some false positive when file system or hardware reorder the writes for file data and metadata. + +## 4.7.0 (4/8/2016) +* options.write_buffer_size changes from 4MB to 64MB. +* options.target_file_size_base changes from 2MB to 64MB. +* options.max_bytes_for_level_base changes from 10MB to 256MB. +* options.soft_pending_compaction_bytes_limit changes from 0 (disabled) to 64GB. +* options.hard_pending_compaction_bytes_limit changes from 0 (disabled) to 256GB. +* table_cache_numshardbits changes from 4 to 6. +* max_file_opening_threads changes from 1 to 16. diff --git a/external/rocksdb/HISTORY.md b/external/rocksdb/HISTORY.md index 7b64daf5fe..1121bf7852 100644 --- a/external/rocksdb/HISTORY.md +++ b/external/rocksdb/HISTORY.md @@ -1,4 +1,113 @@ # Rocksdb Change Log +## 4.11.2 (9/15/2016) +### Bug fixes +* Segfault when failing to open an SST file for read-ahead iterators. +* WAL without data for all CFs is not deleted after recovery. + +## 4.11.1 (8/30/2016) +### Bug Fixes +* Mitigate the regression bug of deadlock condition during recovery when options.max_successive_merges hits. +* Fix data race condition related to hash index in block based table when putting indexes in the block cache. + +## 4.11.0 (8/1/2016) +### Public API Change +* options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. 
When it is set, RocksDB will try to allocate memory from huge page for memtable too, rather than just memtable bloom filter. + +### New Features +* A tool to migrate DB after options change. See include/rocksdb/utilities/option_change_migration.h. +* Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destorying iterators. + +## 4.10.0 (7/5/2016) +### Public API Change +* options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio and deprecate options.memtable_prefix_bloom_probes +* enum type CompressionType and PerfLevel changes from char to unsigned char. Value of all PerfLevel shift by one. +* Deprecate options.filter_deletes. + +### New Features +* Add avoid_flush_during_recovery option. +* Add a read option background_purge_on_iterator_cleanup to avoid deleting files in foreground when destroying iterators. Instead, a job is scheduled in high priority queue and would be executed in a separate background thread. +* RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). For data written by 4.6 or earlier, RepairDB associates it with the default column family. +* Add options.write_buffer_manager which allows users to control total memtable sizes across multiple DB instances. + +## 4.9.0 (6/9/2016) +### Public API changes +* Add bottommost_compression option, This option can be used to set a specific compression algorithm for the bottommost level (Last level containing files in the DB). +* Introduce CompactionJobInfo::compression, This field state the compression algorithm used to generate the output files of the compaction. +* Deprecate BlockBaseTableOptions.hash_index_allow_collision=false +* Deprecate options builder (GetOptions()). + +### New Features +* Introduce NewSimCache() in rocksdb/utilities/sim_cache.h. 
This function creates a block cache that is able to give simulation results (mainly hit rate) of simulating block behavior with a configurable cache size. + +## 4.8.0 (5/2/2016) +### Public API Change +* Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes. +* Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F +* Expose estimate of per-level compression ratio via DB property: "rocksdb.compression-ratio-at-levelN". +* Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status. + +### New Features +* Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size. + +## 4.7.0 (4/8/2016) +### Public API Change +* rename options compaction_measure_io_stats to report_bg_io_stats and include flush too. +* Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options. 
+ +## 4.6.0 (3/10/2016) +### Public API Changes +* Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier. +* Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly. +* Tickers [NUMBER_DB_NEXT, NUMBER_DB_PREV, NUMBER_DB_NEXT_FOUND, NUMBER_DB_PREV_FOUND, ITER_BYTES_READ] are not updated immediately. The are updated when the Iterator is deleted. +* Add monotonically increasing counter (DB property "rocksdb.current-super-version-number") that increments upon any change to the LSM tree. + +### New Features +* Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. +* Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" + +## 4.5.0 (2/5/2016) +### Public API Changes +* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes. +* Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll. +* DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead + +### New Features +* ldb tool now supports operations to non-default column families. +* Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true. +* Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate. 
+ +## 4.4.0 (1/14/2016) +### Public API Changes +* Change names in CompactionPri and add a new one. +* Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit. +* If options.max_write_buffer_number > 3, writes will be slowed down when writing to the last write buffer to delay a full stop. +* Introduce CompactionJobInfo::compaction_reason, this field include the reason to trigger the compaction. +* After slow down is triggered, if estimated pending compaction bytes keep increasing, slowdown more. +* Increase default options.delayed_write_rate to 2MB/s. +* Added a new parameter --path to ldb tool. --path accepts the name of either MANIFEST, SST or a WAL file. Either --db or --path can be used when calling ldb. + +## 4.3.0 (12/8/2015) +### New Features +* CompactionFilter has new member function called IgnoreSnapshots which allows CompactionFilter to be called even if there are snapshots later than the key. +* RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions. +* Introduce LoadLatestOptions() in rocksdb/utilities/options_util.h. This function can construct the latest DBOptions / ColumnFamilyOptions used by the specified RocksDB intance. +* Introduce CheckOptionsCompatibility() in rocksdb/utilities/options_util.h. This function checks whether the input set of options is able to open the specified DB successfully. + +### Public API Changes +* When options.db_write_buffer_size triggers, only the column family with the largest column family size will be flushed, not all the column families. + +## 4.2.0 (11/9/2015) +### New Features +* Introduce CreateLoggerFromOptions(), this function create a Logger for provided DBOptions. +* Add GetAggregatedIntProperty(), which returns the sum of the GetIntProperty of all the column families. +* Add MemoryUtil in rocksdb/utilities/memory.h. 
It currently offers a way to get the memory usage by type from a list rocksdb instances. + +### Public API Changes +* CompactionFilter::Context includes information of Column Family ID +* The need-compaction hint given by TablePropertiesCollector::NeedCompact() will be persistent and recoverable after DB recovery. This introduces a breaking format change. If you use this experimental feature, including NewCompactOnDeletionCollectorFactory() in the new version, you may not be able to directly downgrade the DB back to version 4.0 or lower. +* TablePropertiesCollectorFactory::CreateTablePropertiesCollector() now takes an option Context, containing the information of column family ID for the file being written. +* Remove DefaultCompactionFilterFactory. + ## 4.1.0 (10/8/2015) ### New Features @@ -64,8 +173,8 @@ * options.hard_rate_limit is deprecated. * When options.soft_rate_limit or options.level0_slowdown_writes_trigger is triggered, the way to slow down writes is changed to: write rate to DB is limited to to options.delayed_write_rate. * DB::GetApproximateSizes() adds a parameter to allow the estimation to include data in mem table, with default to be not to include. It is now only supported in skip list mem table. -* DB::CompactRange() now accept CompactRangeOptions instead of multiple paramters. CompactRangeOptions is defined in include/rocksdb/options.h. -* CompactRange() will now skip bottommost level compaction for level based compaction if there is no compaction filter, bottommost_level_compaction is introduced in CompactRangeOptions to control when it's possbile to skip bottommost level compaction. This mean that if you want the compaction to produce a single file you need to set bottommost_level_compaction to BottommostLevelCompaction::kForce. +* DB::CompactRange() now accept CompactRangeOptions instead of multiple parameters. CompactRangeOptions is defined in include/rocksdb/options.h. 
+* CompactRange() will now skip bottommost level compaction for level based compaction if there is no compaction filter, bottommost_level_compaction is introduced in CompactRangeOptions to control when it's possible to skip bottommost level compaction. This mean that if you want the compaction to produce a single file you need to set bottommost_level_compaction to BottommostLevelCompaction::kForce. * Add Cache.GetPinnedUsage() to get the size of memory occupied by entries that are in use by the system. * DB:Open() will fail if the compression specified in Options is not linked with the binary. If you see this failure, recompile RocksDB with compression libraries present on your system. Also, previously our default compression was snappy. This behavior is now changed. Now, the default compression is snappy only if it's available on the system. If it isn't we change the default to kNoCompression. * We changed how we account for memory used in block cache. Previously, we only counted the sum of block sizes currently present in block cache. Now, we count the actual memory usage of the blocks. For example, a block of size 4.5KB will use 8KB memory with jemalloc. This might decrease your memory usage and possibly decrease performance. Increase block cache size if you see this happening after an upgrade. @@ -97,7 +206,7 @@ Lower numbered levels will be placed earlier in the db_paths and higher numbered levels will be placed later in the db_paths vector. * Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000) -* Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guidline, newer versions have more features, but might not be readable by older versions of RocksDB. +* Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. 
As a general guideline, newer versions have more features, but might not be readable by older versions of RocksDB. * Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions. * MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv(). * Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash. @@ -173,7 +282,7 @@ ## 3.5.0 (9/3/2014) ### New Features -* Add include/utilities/write_batch_with_index.h, providing a utilitiy class to query data out of WriteBatch when building it. +* Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to shared_ptr from a raw pointer. * Remove deprecated options: disable_seek_compaction and db_stats_log_interval @@ -187,7 +296,7 @@ * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0. -* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. 
This may improve perfomance in case there are a large number of delete markers or overwritten entries. +* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve performance in case there are a large number of delete markers or overwritten entries. ### Public API changes * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size @@ -202,7 +311,7 @@ ### New Features * Added JSON API prototype. * HashLinklist reduces performance outlier caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory(). -* RocksDB is now able to reclaim storage space more effectively during the compaction process. This is done by compensating the size of each deletion entry by the 2X average value size, which makes compaction to be triggerred by deletion entries more easily. +* RocksDB is now able to reclaim storage space more effectively during the compaction process. This is done by compensating the size of each deletion entry by the 2X average value size, which makes compaction to be triggered by deletion entries more easily. * Add TimeOut API to write. Now WriteOptions have a variable called timeout_hint_us. With timeout_hint_us set to non-zero, any write associated with this timeout_hint_us may be aborted when it runs longer than the specified timeout_hint_us, and it is guaranteed that any write completes earlier than the specified time-out will not be aborted due to the time-out condition. * Add a rate_limiter option, which controls total throughput of flush and compaction. The throughput is specified in bytes/sec. Flush always has precedence over compaction when available bandwidth is constrained. 
@@ -217,11 +326,11 @@ 2) It added some complexity to the important code-paths, 3) None of our internal customers were really using it. Because of that, Options::disable_seek_compaction is now obsolete. It is still a parameter in Options, so it does not break the build, but it does not have any effect. We plan to completely remove it at some point, so we ask users to please remove this option from your code base. -* Add two paramters to NewHashLinkListRepFactory() for logging on too many entries in a hash bucket when flushing. +* Add two parameters to NewHashLinkListRepFactory() for logging on too many entries in a hash bucket when flushing. * Added new option BlockBasedTableOptions::hash_index_allow_collision. When enabled, prefix hash index for block-based table will not store prefix and allow hash collision, reducing memory consumption. ### New Features -* PlainTable now supports a new key encoding: for keys of the same prefix, the prefix is only written once. It can be enabled through encoding_type paramter of NewPlainTableFactory() +* PlainTable now supports a new key encoding: for keys of the same prefix, the prefix is only written once. It can be enabled through encoding_type parameter of NewPlainTableFactory() * Add AdaptiveTableFactory, which is used to convert from a DB of PlainTable to BlockBasedTabe, or vise versa. It can be created using NewAdaptiveTableFactory() ### Performance Improvements diff --git a/external/rocksdb/INSTALL.md b/external/rocksdb/INSTALL.md index 50b27c80d1..3669bf1cf1 100644 --- a/external/rocksdb/INSTALL.md +++ b/external/rocksdb/INSTALL.md @@ -1,22 +1,27 @@ ## Compilation +**Important**: If you plan to run RocksDB in production, don't compile using default +`make` or `make all`. That will compile RocksDB in debug mode, which is much slower +than release mode. + RocksDB's library should be able to compile without any dependency installed, although we recommend installing some compression libraries (see below). 
We do depend on newer gcc/clang with C++11 support. There are few options when compiling RocksDB: -* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library. +* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library. Compiles static library in release mode. -* `make shared_lib` will compile librocksdb.so, RocksDB shared library. +* `make shared_lib` will compile librocksdb.so, RocksDB shared library. Compiles shared library in release mode. -* `make check` will compile and run all the unit tests +* `make check` will compile and run all the unit tests. `make check` will compile RocksDB in debug mode. * `make all` will compile our static library, and all our tools and unit tests. Our tools -depend on gflags. You will need to have gflags installed to run `make all`. +depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't +use binaries compiled by `make all` in production. * By default the binary we produce is optimized for the platform you're compiling on -(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before +(-march=native or the equivalent). If you want to build a portable binary, add 'PORTABLE=1' before your make commands, like this: `PORTABLE=1 make static_lib` ## Dependencies @@ -82,4 +87,5 @@ your make commands, like this: `PORTABLE=1 make static_lib` * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. * **Windows**: + * For building with MS Visual Studio 13 you will need Update 4 installed. 
* Read and follow the instructions at CMakeLists.txt diff --git a/external/rocksdb/LANGUAGE-BINDINGS.md b/external/rocksdb/LANGUAGE-BINDINGS.md new file mode 100644 index 0000000000..e9aaf2f16a --- /dev/null +++ b/external/rocksdb/LANGUAGE-BINDINGS.md @@ -0,0 +1,12 @@ +This is the list of all known third-party language bindings for RocksDB. If something is missing, please open a pull request to add it. + +* Java - https://github.com/facebook/rocksdb/tree/master/java +* Python - http://pyrocksdb.readthedocs.org/en/latest/ +* Perl - https://metacpan.org/pod/RocksDB +* Node.js - https://npmjs.org/package/rocksdb +* Go - https://github.com/tecbot/gorocksdb +* Ruby - http://rubygems.org/gems/rocksdb-ruby +* Haskell - https://hackage.haskell.org/package/rocksdb-haskell +* PHP - https://github.com/Photonios/rocksdb-php +* C# - https://github.com/warrenfalk/rocksdb-sharp +* Rust - https://github.com/spacejam/rust-rocksdb diff --git a/external/rocksdb/LICENSE b/external/rocksdb/LICENSE index b132901869..46f685e968 100644 --- a/external/rocksdb/LICENSE +++ b/external/rocksdb/LICENSE @@ -2,7 +2,7 @@ BSD License For rocksdb software -Copyright (c) 2014, Facebook, Inc. +Copyright (c) 2011-present, Facebook, Inc. All rights reserved. --------------------------------------------------------------------- diff --git a/external/rocksdb/Makefile b/external/rocksdb/Makefile index da396c65c1..857c521019 100644 --- a/external/rocksdb/Makefile +++ b/external/rocksdb/Makefile @@ -15,7 +15,9 @@ ARFLAGS = rs # Transform parallel LOG output into something more readable. perl_command = perl -n \ - -e '@a=split("\t",$$_,-1); $$t=$$a[8]; $$t =~ s,^\./,,;' \ + -e '@a=split("\t",$$_,-1); $$t=$$a[8];' \ + -e '$$t =~ /.*if\s\[\[\s"(.*?\.[\w\/]+)/ and $$t=$$1;' \ + -e '$$t =~ s,^\./,,;' \ -e '$$t =~ s, >.*,,; chomp $$t;' \ -e '$$t =~ /.*--gtest_filter=(.*?\.[\w\/]+)/ and $$t=$$1;' \ -e 'printf "%7.3f %s %s\n", $$a[3], $$a[6] == 0 ? 
"PASS" : "FAIL", $$t' @@ -33,12 +35,22 @@ quoted_perl_command = $(subst ','\'',$(perl_command)) # with debug level 0. To compile with level 0, run `make shared_lib`, # `make install-shared`, `make static_lib`, `make install-static` or # `make install` -DEBUG_LEVEL=1 + +# Set the default DEBUG_LEVEL to 1 +DEBUG_LEVEL?=1 ifeq ($(MAKECMDGOALS),dbg) DEBUG_LEVEL=2 endif +ifeq ($(MAKECMDGOALS),clean) + DEBUG_LEVEL=0 +endif + +ifeq ($(MAKECMDGOALS),release) + DEBUG_LEVEL=0 +endif + ifeq ($(MAKECMDGOALS),shared_lib) DEBUG_LEVEL=0 endif @@ -74,7 +86,8 @@ endif # compile with -O2 if debug level is not 2 ifneq ($(DEBUG_LEVEL), 2) OPT += -O2 -fno-omit-frame-pointer -ifneq ($(MACHINE),ppc64) # ppc64 doesn't support -momit-leaf-frame-pointer +# Skip for archs that don't support -momit-leaf-frame-pointer +ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1)) OPT += -momit-leaf-frame-pointer endif endif @@ -84,6 +97,8 @@ endif ifeq ($(DEBUG_LEVEL),0) OPT += -DNDEBUG DISABLE_WARNING_AS_ERROR=1 +else +$(warning Warning: Compiling in debug mode. 
Don't use the resulting binary in production) endif #----------------------------------------------- @@ -115,14 +130,27 @@ am__v_AR_ = $(am__v_AR_$(AM_DEFAULT_VERBOSITY)) am__v_AR_0 = @echo " AR " $@; am__v_AR_1 = -AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +ifdef ROCKSDB_USE_LIBRADOS +LIB_SOURCES += utilities/env_librados.cc +LDFLAGS += -lrados +endif +AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) # detect what platform we're building on dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) # this file is generated by the previous line to set build flags and sources include make_config.mk CLEAN_FILES += make_config.mk +missing_make_config_paths := $(shell \ + grep "\/\S*" -o $(CURDIR)/make_config.mk | \ + while read path; \ + do [ -e $$path ] || echo $$path; \ + done | sort | uniq) + +$(foreach path, $(missing_make_config_paths), \ + $(warning Warning: $(path) dont exist)) + ifneq ($(PLATFORM), IOS) CFLAGS += -g CXXFLAGS += -g @@ -131,6 +159,9 @@ else OPT += -DNDEBUG endif +ifeq ($(PLATFORM), OS_SOLARIS) + PLATFORM_CXXFLAGS += -D _GLIBCXX_USE_C99 +endif ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) # found CFLAGS += -fno-exceptions @@ -153,12 +184,22 @@ ifdef COMPILE_WITH_TSAN PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN # Turn off -pg when enabling TSAN testing, because that induces # a link failure. TODO: find the root cause - pg = -else - pg = -pg + PROFILING_FLAGS = +endif + +# USAN doesn't work well with jemalloc. If we're compiling with USAN, we should use regular malloc. 
+ifdef COMPILE_WITH_UBSAN + DISABLE_JEMALLOC=1 + EXEC_LDFLAGS += -fsanitize=undefined + PLATFORM_CCFLAGS += -fsanitize=undefined + PLATFORM_CXXFLAGS += -fsanitize=undefined endif ifndef DISABLE_JEMALLOC + ifdef JEMALLOC + PLATFORM_CXXFLAGS += "-DROCKSDB_JEMALLOC" + PLATFORM_CCFLAGS += "-DROCKSDB_JEMALLOC" + endif EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) @@ -214,8 +255,8 @@ util/build_version.cc: FORCE else mv -f $@-t $@; fi LIBOBJECTS = $(LIB_SOURCES:.cc=.o) -LIBOBJECTS += $(TOOL_SOURCES:.cc=.o) -MOCKOBJECTS = $(MOCK_SOURCES:.cc=.o) +LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) +MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) GTEST = $(GTEST_DIR)/gtest/gtest-all.o TESTUTIL = ./util/testutil.o @@ -225,18 +266,29 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full +BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) + TESTS = \ db_test \ + db_test2 \ + db_block_cache_test \ + db_bloom_filter_test \ db_iter_test \ db_log_iter_test \ db_compaction_filter_test \ db_compaction_test \ db_dynamic_level_test \ + db_flush_test \ db_inplace_update_test \ + db_iterator_test \ + db_options_test \ + db_sst_test \ db_tailing_iter_test \ db_universal_compaction_test \ db_wal_test \ - block_hash_index_test \ + db_io_failure_test \ + db_properties_test \ + db_table_properties_test \ autovector_test \ column_family_test \ table_properties_collector_test \ @@ -253,6 +305,7 @@ TESTS = \ crc32c_test \ slice_transform_test \ dbformat_test \ + env_basic_test \ env_test \ fault_injection_test \ filelock_test \ @@ -260,15 +313,19 @@ TESTS = \ file_reader_writer_test \ block_based_filter_block_test \ full_filter_block_test \ + hash_table_test \ histogram_test \ + inlineskiplist_test \ log_test \ manual_compaction_test \ - memenv_test \ mock_env_test \ memtable_list_test \ merge_helper_test \ + memory_test \ merge_test \ 
merger_test \ + util_merge_operators_test \ + options_file_test \ redis_test \ reduce_levels_test \ plain_table_db_test \ @@ -280,6 +337,7 @@ TESTS = \ backupable_db_test \ document_db_test \ json_document_test \ + sim_cache_test \ spatial_db_test \ version_edit_test \ version_set_test \ @@ -296,6 +354,8 @@ TESTS = \ rate_limiter_test \ delete_scheduler_test \ options_test \ + options_settable_test \ + options_util_test \ event_logger_test \ cuckoo_table_builder_test \ cuckoo_table_reader_test \ @@ -314,28 +374,62 @@ TESTS = \ heap_test \ compact_on_deletion_collector_test \ compaction_job_stats_test \ + option_change_migration_test \ transaction_test \ - ldb_cmd_test + ldb_cmd_test \ + iostats_context_test \ + persistent_cache_test \ + statistics_test \ + +PARALLEL_TEST = \ + backupable_db_test \ + compact_on_deletion_collector_test \ + db_compaction_filter_test \ + db_compaction_test \ + db_sst_test \ + db_test \ + db_universal_compaction_test \ + fault_injection_test \ + inlineskiplist_test \ + manual_compaction_test \ + table_test + +SUBSET := $(TESTS) +ifdef ROCKSDBTESTS_START + SUBSET := $(shell echo $(SUBSET) | sed 's/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/') +endif -SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) +ifdef ROCKSDBTESTS_END + SUBSET := $(shell echo $(SUBSET) | sed 's/$(ROCKSDBTESTS_END).*//') +endif TOOLS = \ sst_dump \ db_sanity_test \ db_stress \ + write_stress \ ldb \ db_repl_stress \ rocksdb_dump \ rocksdb_undump +TEST_LIBS = \ + librocksdb_env_basic_test.a + +# TODO: add back forward_iterator_bench, after making it build in all environemnts. BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench -# The library name is configurable since we are maintaining libraries of both -# debug/release mode. 
+# if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) +# we should only run rocksdb in production with DEBUG_LEVEL 0 +ifeq ($(DEBUG_LEVEL),0) LIBNAME=librocksdb +else + LIBNAME=librocksdb_debug +endif endif LIBRARY = ${LIBNAME}.a +TOOLS_LIBRARY = ${LIBNAME}_tools.a ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) @@ -379,7 +473,7 @@ $(SHARED3): $(SHARED4) endif $(SHARED4): - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIB_SOURCES) \ + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIB_SOURCES) $(TOOL_LIB_SOURCES) \ $(LDFLAGS) -o $@ endif # PLATFORM_SHARED_EXT @@ -387,7 +481,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ dbg rocksdbjavastatic rocksdbjava install install-static install-shared uninstall \ - analyze tools + analyze tools tools_lib all: $(LIBRARY) tools @@ -398,12 +492,16 @@ shared_lib: $(SHARED) tools: $(TOOLS) +tools_lib: $(TOOLS_LIBRARY) + +test_libs: $(TEST_LIBS) + dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS) # creates static library and programs release: $(MAKE) clean - OPT="-DNDEBUG -O2" $(MAKE) static_lib tools db_bench + DEBUG_LEVEL=0 $(MAKE) static_lib tools db_bench coverage: $(MAKE) clean @@ -412,7 +510,26 @@ coverage: # Delete intermediate files find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; -# Extract the names of its tests by running db_test with --gtest_list_tests. +ifneq (,$(filter check parallel_check,$(MAKECMDGOALS)),) +# Use /dev/shm if it has the sticky bit set (otherwise, /tmp), +# and create a randomly-named rocksdb.XXXX directory therein. +# We'll use that directory in the "make check" rules. 
+ifeq ($(TMPD),) +TMPD := $(shell f=/dev/shm; test -k $$f || f=/tmp; \ + perl -le 'use File::Temp "tempdir";' \ + -e 'print tempdir("'$$f'/rocksdb.XXXX", CLEANUP => 0)') +endif +endif + +# Run all tests in parallel, accumulating per-test logs in t/log-*. +# +# Each t/run-* file is a tiny generated bourne shell script that invokes one of +# sub-tests. Why use a file for this? Because that makes the invocation of +# parallel below simpler, which in turn makes the parsing of parallel's +# LOG simpler (the latter is for live monitoring as parallel +# tests run). +# +# Test names are extracted by running tests with --gtest_list_tests. # This filter removes the "#"-introduced comments, and expands to # fully-qualified names by changing input like this: # @@ -430,52 +547,33 @@ coverage: # MultiThreaded/MultiThreadedDBTest.MultiThreaded/0 # MultiThreaded/MultiThreadedDBTest.MultiThreaded/1 # -test_names = \ - ./db_test --gtest_list_tests \ - | perl -n \ - -e 's/ *\#.*//;' \ - -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};' \ - -e 'print qq! $$p$$2!' - -ifeq ($(MAKECMDGOALS),check) -# Use /dev/shm if it has the sticky bit set (otherwise, /tmp), -# and create a randomly-named rocksdb.XXXX directory therein. -# We'll use that directory in the "make check" rules. -ifeq ($(TMPD),) -TMPD := $(shell f=/dev/shm; test -k $$f || f=/tmp; \ - perl -le 'use File::Temp "tempdir";' \ - -e 'print tempdir("'$$f'/rocksdb.XXXX", CLEANUP => 0)') -endif -endif - -ifneq ($(T),) - -# Run all tests in parallel, accumulating per-test logs in t/log-*. - -# t_sanitized is each $(T) with "-" in place of each "/". -t_sanitized = $(subst /,-,$(T)) -# t_run is each sanitized name with a leading "t/". 
-t_run = $(patsubst %,t/%,$(t_sanitized)) +parallel_tests = $(patsubst %,parallel_%,$(PARALLEL_TEST)) +.PHONY: gen_parallel_tests $(parallel_tests) +$(parallel_tests): $(PARALLEL_TEST) + $(AM_V_at)TEST_BINARY=$(patsubst parallel_%,%,$@); \ + TEST_NAMES=` \ + ./$$TEST_BINARY --gtest_list_tests \ + | perl -n \ + -e 's/ *\#.*//;' \ + -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};' \ + -e 'print qq! $$p$$2!'`; \ + for TEST_NAME in $$TEST_NAMES; do \ + TEST_SCRIPT=t/run-$$TEST_BINARY-$${TEST_NAME//\//-}; \ + echo " GEN " $$TEST_SCRIPT; \ + printf '%s\n' \ + '#!/bin/sh' \ + "d=\$(TMPD)$$TEST_SCRIPT" \ + 'mkdir -p $$d' \ + "TEST_TMPDIR=\$$d ./$$TEST_BINARY --gtest_filter=$$TEST_NAME" \ + > $$TEST_SCRIPT; \ + chmod a=rx $$TEST_SCRIPT; \ + done -# Each t_run file is a tiny generated bourne shell script -# that invokes one of db_tests's sub-tests. Why use a file -# for this? Because that makes the invocation of parallel -# below simpler, which in turn makes the parsing of parallel's -# LOG simpler (the latter is for live monitoring as parallel -# tests run). -filter = --gtest_filter=$(subst -,/,$(@F)) -$(t_run): Makefile db_test - $(AM_V_GEN)mkdir -p t - $(AM_V_at)rm -f $@ $@-t - $(AM_V_at)printf '%s\n' \ - '#!/bin/sh' \ - 'd=$(TMPD)/$(@F)' \ - 'mkdir -p $$d' \ - 'TEST_TMPDIR=$$d ./db_test $(filter)' \ - > $@-t - $(AM_V_at)chmod a=rx $@-t - $(AM_V_at)mv $@-t $@ +gen_parallel_tests: + $(AM_V_at)mkdir -p t + $(AM_V_at)rm -f t/run-* + $(MAKE) $(parallel_tests) # Reorder input lines (which are one per test) so that the # longest-running tests appear first in the output. 
@@ -494,7 +592,7 @@ $(t_run): Makefile db_test # 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest # slow_test_regexp = \ - ^t/DBTest\.(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$ + ^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$ prioritize_long_running_tests = \ perl -pe 's,($(slow_test_regexp)),100 $$1,' \ | sort -k1,1gr \ @@ -505,26 +603,43 @@ prioritize_long_running_tests = \ # Run with "make J=200% check" to run two parallel jobs per core. # The default is to run one job per core (J=100%). # See "man parallel" for its "-j ..." option. -J = 100% +J ?= 100% # Use this regexp to select the subset of tests whose names match. tests-regexp = . +t_run = $(wildcard t/run-*) .PHONY: check_0 -check_0: $(t_run) +check_0: + $(AM_V_GEN)export TEST_TMPDIR=$(TMPD); \ + printf '%s\n' '' \ + 'To monitor subtest ,' \ + ' run "make watch-log" in a separate window' ''; \ + test -t 1 && eta=--eta || eta=; \ + { \ + printf './%s\n' $(filter-out $(PARALLEL_TEST),$(TESTS)); \ + printf '%s\n' $(t_run); \ + } \ + | $(prioritize_long_running_tests) \ + | grep -E '$(tests-regexp)' \ + | build_tools/gnu_parallel -j$(J) --joblog=LOG $$eta --gnu '{} >& t/log-{/}' + +.PHONY: valgrind_check_0 +valgrind_check_0: $(AM_V_GEN)export TEST_TMPDIR=$(TMPD); \ printf '%s\n' '' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ test -t 1 && eta=--eta || eta=; \ { \ - printf './%s\n' $(filter-out db_test, $(TESTS)); \ + printf './%s\n' $(filter-out $(PARALLEL_TEST) %skiplist_test options_settable_test, $(TESTS)); \ printf '%s\n' $(t_run); \ } \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ - | parallel -j$(J) --joblog=LOG $$eta --gnu '{} >& t/log-{/}' -endif + | build_tools/gnu_parallel -j$(J) --joblog=LOG $$eta --gnu \ + 'if [[ "{}" == "./"* ]] ; then $(DRIVER) {} >& t/valgrind_log-{/}; ' \ + 'else {} >& t/valgrind_log-{/}; fi' CLEAN_FILES += t LOG $(TMPD) 
@@ -541,19 +656,21 @@ watch-log: # If J != 1 and GNU parallel is installed, run the tests in parallel, # via the check_0 rule above. Otherwise, run them sequentially. check: all + $(MAKE) gen_parallel_tests $(AM_V_GEN)if test "$(J)" != 1 \ - && (parallel --gnu --help 2>/dev/null) | \ + && (build_tools/gnu_parallel --gnu --help 2>/dev/null) | \ grep -q 'GNU Parallel'; \ then \ - t=$$($(test_names)); \ $(MAKE) T="$$t" TMPD=$(TMPD) check_0; \ else \ for t in $(TESTS); do \ echo "===== Running $$t"; ./$$t || exit 1; done; \ fi rm -rf $(TMPD) +ifeq ($(filter -DROCKSDB_LITE,$(OPT)),) python tools/ldb_test.py sh tools/rocksdb_dump_test.sh +endif check_some: $(SUBSET) ldb_tests for t in $(SUBSET); do echo "===== Running $$t"; ./$$t || exit 1; done @@ -565,12 +682,18 @@ ldb_tests: ldb crash_test: whitebox_crash_test blackbox_crash_test blackbox_crash_test: db_stress - python -u tools/db_crashtest.py -s - python -u tools/db_crashtest.py + python -u tools/db_crashtest.py --simple blackbox + python -u tools/db_crashtest.py blackbox + +ifeq ($(CRASH_TEST_KILL_ODD),) + CRASH_TEST_KILL_ODD=888887 +endif whitebox_crash_test: db_stress - python -u tools/db_crashtest2.py -s - python -u tools/db_crashtest2.py + python -u tools/db_crashtest.py --simple whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) + python -u tools/db_crashtest.py whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) asan_check: $(MAKE) clean @@ -582,14 +705,80 @@ asan_crash_test: COMPILE_WITH_ASAN=1 $(MAKE) crash_test $(MAKE) clean +ubsan_check: + $(MAKE) clean + COMPILE_WITH_UBSAN=1 $(MAKE) check -j32 + $(MAKE) clean + +ubsan_crash_test: + $(MAKE) clean + COMPILE_WITH_UBSAN=1 $(MAKE) crash_test + $(MAKE) clean + valgrind_check: $(TESTS) - for t in $(filter-out skiplist_test,$(TESTS)); do \ - $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ + $(MAKE) gen_parallel_tests + $(AM_V_GEN)if test "$(J)" != 1 \ + && (build_tools/gnu_parallel --gnu --help 2>/dev/null) | \ + grep -q 'GNU Parallel'; \ + then \ + $(MAKE) 
TMPD=$(TMPD) \ + DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" valgrind_check_0; \ + else \ + for t in $(filter-out %skiplist_test options_settable_test,$(TESTS)); do \ + $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ + ret_code=$$?; \ + if [ $$ret_code -ne 0 ]; then \ + exit $$ret_code; \ + fi; \ + done; \ + fi + + +ifneq ($(PAR_TEST),) +parloop: + ret_bad=0; \ + for t in $(PAR_TEST); do \ + echo "===== Running $$t in parallel $(NUM_PAR)";\ + if [ $(db_test) -eq 1 ]; then \ + seq $(J) | v="$$t" build_tools/gnu_parallel --gnu 's=$(TMPD)/rdb-{}; export TEST_TMPDIR=$$s;' \ + 'timeout 2m ./db_test --gtest_filter=$$v >> $$s/log-{} 2>1'; \ + else\ + seq $(J) | v="./$$t" build_tools/gnu_parallel --gnu 's=$(TMPD)/rdb-{};' \ + 'export TEST_TMPDIR=$$s; timeout 10m $$v >> $$s/log-{} 2>1'; \ + fi; \ ret_code=$$?; \ if [ $$ret_code -ne 0 ]; then \ - exit $$ret_code; \ + ret_bad=$$ret_code; \ + echo $$t exited with $$ret_code; \ fi; \ - done + done; \ + exit $$ret_bad; +endif + +test_names = \ + ./db_test --gtest_list_tests \ + | perl -n \ + -e 's/ *\#.*//;' \ + -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};' \ + -e 'print qq! $$p$$2!' 
+ +parallel_check: $(TESTS) + $(AM_V_GEN)if test "$(J)" > 1 \ + && (build_tools/gnu_parallel --gnu --help 2>/dev/null) | \ + grep -q 'GNU Parallel'; \ + then \ + echo Running in parallel $(J); \ + else \ + echo "Need to have GNU Parallel and J > 1"; exit 1; \ + fi; \ + ret_bad=0; \ + echo $(J);\ + echo Test Dir: $(TMPD); \ + seq $(J) | build_tools/gnu_parallel --gnu 's=$(TMPD)/rdb-{}; rm -rf $$s; mkdir $$s'; \ + $(MAKE) PAR_TEST="$(shell $(test_names))" TMPD=$(TMPD) \ + J=$(J) db_test=1 parloop; \ + $(MAKE) PAR_TEST="$(filter-out db_test, $(TESTS))" \ + TMPD=$(TMPD) J=$(J) db_test=0 parloop; analyze: clean $(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) \ @@ -611,7 +800,7 @@ unity.a: unity.o $(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o # try compiling db_test with unity -unity_test: db/db_test.o util/db_test_util.o $(TESTHARNESS) unity.a +unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) unity.a $(AM_LINK) ./unity_test @@ -627,7 +816,7 @@ clean: tags: ctags * -R - cscope -b `find . -name '*.cc'` `find . -name '*.h'` + cscope -b `find . -name '*.cc'` `find . -name '*.h'` `find . 
-name '*.c'` format: build_tools/format-diff.sh @@ -642,7 +831,15 @@ $(LIBRARY): $(LIBOBJECTS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) -db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) +$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) + $(AM_V_AR)rm -f $@ + $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ + +librocksdb_env_basic_test.a: util/env_basic_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_V_AR)rm -f $@ + $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ + +db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) $(AM_LINK) cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) @@ -651,10 +848,10 @@ cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) memtablerep_bench: db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) +write_stress: tools/write_stress.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) @@ -669,7 +866,7 @@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) +column_family_test: db/column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -690,12 +887,18 @@ cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +option_change_migration_test: utilities/option_change_migration/option_change_migration_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) 
redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +hash_table_test: utilities/persistent_cache/hash_table_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -711,38 +914,68 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_test: db/db_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_test2: db/db_test2.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_block_cache_test: db/db_block_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_bloom_filter_test: db/db_bloom_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_log_iter_test: db/db_log_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_compaction_filter_test: db/db_compaction_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_log_iter_test: db/db_log_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_compaction_test: db/db_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_compaction_filter_test: db/db_compaction_filter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_dynamic_level_test: db/db_dynamic_level_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_compaction_test: db/db_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_flush_test: db/db_flush_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_dynamic_level_test: db/db_dynamic_level_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_inplace_update_test: db/db_inplace_update_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_inplace_update_test: db/db_inplace_update_test.o 
util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_iterator_test: db/db_iterator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_tailing_iter_test: db/db_tailing_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_sst_test: db/db_sst_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) db_iter_test: db/db_iter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_universal_compaction_test: db/db_universal_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_wal_test: db/db_wal_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_wal_test: db/db_wal_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_io_failure_test: db/db_io_failure_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_properties_test: db/db_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_table_properties_test: db/db_table_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) $(pg) + $(AM_LINK) $(PROFILING_FLAGS) plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -751,7 +984,7 @@ comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) $(pg) + $(AM_LINK) $(PROFILING_FLAGS) perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) @@ -771,9 +1004,23 @@ document_db_test: utilities/document/document_db_test.o 
$(LIBOBJECTS) $(TESTHARN json_document_test: utilities/document/json_document_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +env_mirror_test: utilities/env_mirror_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +ifdef ROCKSDB_USE_LIBRADOS +env_librados_test: utilities/env_librados_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +endif + +env_registry_test: utilities/env_registry_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -801,6 +1048,9 @@ wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +env_basic_test: util/env_basic_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -834,6 +1084,9 @@ table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +inlineskiplist_test: db/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -864,12 +1117,21 @@ write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) merge_helper_test: db/merge_helper_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +memory_test: utilities/memory/memory_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +util_merge_operators_test: utilities/util_merge_operators_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +options_file_test: db/options_file_test.o 
$(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -891,7 +1153,7 @@ cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTH cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -listener_test: db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) +listener_test: db/listener_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -903,13 +1165,19 @@ compact_files_test: db/compact_files_test.o $(LIBOBJECTS) $(TESTHARNESS) options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_settable_test: util/options_settable_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +options_util_test: utilities/options/options_util_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) -sst_dump_test: util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) +event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -memenv_test : util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS) +sst_dump_test: tools/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) optimistic_transaction_test: utilities/transactions/optimistic_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -918,13 +1186,13 @@ optimistic_transaction_test: utilities/transactions/optimistic_transaction_test. 
mock_env_test : util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: db/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -942,12 +1210,24 @@ transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TEST sst_dump: tools/sst_dump.o $(LIBOBJECTS) $(AM_LINK) -ldb_cmd_test: util/ldb_cmd_test.o $(LIBOBJECTS) $(TESTHARNESS) +repair_test: db/repair_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +ldb_cmd_test: tools/ldb_cmd_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) ldb: tools/ldb.o $(LIBOBJECTS) $(AM_LINK) +iostats_context_test: util/iostats_context_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + +persistent_cache_test: utilities/persistent_cache/persistent_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +statistics_test: util/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local @@ -990,7 +1270,11 @@ install: install-static # --------------------------------------------------------------------------- JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux -ARCH := $(shell getconf LONG_BIT) +ifeq ($(PLATFORM), OS_SOLARIS) + ARCH := $(shell isainfo -b) +else + ARCH := $(shell getconf LONG_BIT) +endif ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar ROCKSDB_JAR_ALL = 
rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar @@ -998,14 +1282,19 @@ ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PA ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDBJNILIB = librocksdbjni-osx.jnilib -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar + ROCKSDBJNILIB = librocksdbjni-osx.jnilib + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin else JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif endif +ifeq ($(PLATFORM), OS_SOLARIS) + ROCKSDBJNILIB = librocksdbjni-solaris$(ARCH).so + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-solaris$(ARCH).jar + JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/solaris +endif libz.a: -rm -rf zlib-1.2.8 @@ -1037,20 +1326,23 @@ liblz4.a: cd lz4-r127/lib && make CFLAGS='-fPIC' all cp lz4-r127/lib/liblz4.a . 
-# A version of each $(LIBOBJECTS) compiled with -fPIC -java_libobjects = $(patsubst %,jl/%,$(LIBOBJECTS)) -CLEAN_FILES += jl +# A version of each $(LIBOBJECTS) compiled with -fPIC and a fixed set of static compression libraries +java_static_libobjects = $(patsubst %,jls/%,$(LIBOBJECTS)) +CLEAN_FILES += jls -$(java_libobjects): jl/%.o: %.cc - $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) +JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 +JAVA_STATIC_INCLUDES = -I./zlib-1.2.8 -I./bzip2-1.0.6 -I./snappy-1.1.1 -I./lz4-r127/lib + +$(java_static_libobjects): jls/%.o: %.cc libz.a libbz2.a libsnappy.a liblz4.a + $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -fPIC -c $< -o $@ $(COVERAGEFLAGS) -rocksdbjavastatic: $(java_libobjects) libz.a libbz2.a libsnappy.a liblz4.a +rocksdbjavastatic: $(java_static_libobjects) cd java;$(MAKE) javalib; rm -f ./java/target/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \ -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \ - $(java_libobjects) $(COVERAGEFLAGS) \ - libz.a libbz2.a libsnappy.a liblz4.a $(LDFLAGS) + $(java_static_libobjects) $(COVERAGEFLAGS) \ + libz.a libbz2.a libsnappy.a liblz4.a $(JAVA_STATIC_LDFLAGS) cd java/target;strip -S -x $(ROCKSDBJNILIB) cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) @@ -1061,7 +1353,7 @@ rocksdbjavastatic: $(java_libobjects) libz.a libbz2.a libsnappy.a liblz4.a rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java;jar -uf target/$(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) 
org/rocksdb/*.class org/rocksdb/util/*.class rocksdbjavastaticpublish: rocksdbjavastaticrelease @@ -1072,6 +1364,13 @@ rocksdbjavastaticpublish: rocksdbjavastaticrelease mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar +# A version of each $(LIBOBJECTS) compiled with -fPIC +java_libobjects = $(patsubst %,jl/%,$(LIBOBJECTS)) +CLEAN_FILES += jl + +$(java_libobjects): jl/%.o: %.cc + $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) + rocksdbjava: $(java_libobjects) $(AM_V_GEN)cd java;$(MAKE) javalib; $(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB) @@ -1083,17 +1382,22 @@ rocksdbjava: $(java_libobjects) jclean: cd java;$(MAKE) clean; +jtest_compile: rocksdbjava + cd java;$(MAKE) java_test + +jtest_run: + cd java;$(MAKE) run_test + jtest: rocksdbjava cd java;$(MAKE) sample;$(MAKE) test; jdb_bench: cd java;$(MAKE) db_bench; -commit-prereq: - $(MAKE) clean && $(MAKE) all check; +commit_prereq: build_tools/rocksdb-lego-determinator \ + build_tools/precommit_checker.py + J=$(J) build_tools/precommit_checker.py unit unit_481 clang_unit release release_481 clang_release tsan asan ubsan lite unit_non_shm $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava; - $(MAKE) clean && USE_CLANG=1 $(MAKE) all; - $(MAKE) clean && OPT=-DROCKSDB_LITE $(MAKE) static_lib; xfunc: for xftest in $(XFUNC_TESTS); do \ @@ -1140,7 +1444,7 @@ endif # Source files dependencies detection # --------------------------------------------------------------------------- -all_sources = $(LIB_SOURCES) 
$(TEST_BENCH_SOURCES) $(MOCK_SOURCES) +all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) DEPFILES = $(all_sources:.cc=.d) # Add proper dependency support so changing a .h file forces a .cc file to diff --git a/external/rocksdb/README.md b/external/rocksdb/README.md index 916bdecdee..550c352b88 100644 --- a/external/rocksdb/README.md +++ b/external/rocksdb/README.md @@ -1,6 +1,8 @@ ## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) +[![Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/master?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/master) + RocksDB is developed and maintained by Facebook Database Engineering Team. It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) diff --git a/external/rocksdb/USERS.md b/external/rocksdb/USERS.md index 386a23dcc5..890f3399b4 100644 --- a/external/rocksdb/USERS.md +++ b/external/rocksdb/USERS.md @@ -1,15 +1,21 @@ This document lists users of RocksDB and their use cases. If you are using RocksDB, please open a pull request and add yourself to the list. ## Facebook -At Facebook, we use RocksDB as a backend for many different stateful services. We're also experimenting with running RocksDB as a storage engine for two databases: +At Facebook, we use RocksDB as storage engines in multiple data management services and a backend for many different stateful services, including: 1. MyRocks -- https://github.com/MySQLOnRocksDB/mysql-5.6 2. MongoRocks -- https://github.com/mongodb-partners/mongo-rocks +3. ZippyDB -- Facebook's distributed key-value store with Paxos-style replication, built on top of RocksDB.[*] https://www.youtube.com/watch?v=DfiN7pG0D0khtt +4. 
Laser -- Laser is a high query throughput, low (millisecond) latency, key-value storage service built on top of RocksDB.[*] +4. Dragan -- a distributed graph query engine. https://code.facebook.com/posts/1737605303120405/dragon-a-distributed-graph-query-engine/ +5. Stylus -- a low-level stream processing framework writtenin C++.[*] + +[*] https://research.facebook.com/publications/realtime-data-processing-at-facebook/ ## LinkedIn Two different use cases at Linkedin are using RocksDB as a storage engine: -1. LinkedIn's follow feed for storing user's activities +1. LinkedIn's follow feed for storing user's activities. Check out the blog post: https://engineering.linkedin.com/blog/2016/03/followfeed--linkedin-s-feed-made-faster-and-smarter 2. Apache Samza, open source framework for stream processing Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasundaram: http://www.youtube.com/watch?v=plqVp_OnSzg @@ -37,3 +43,31 @@ Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/roc ## Santanader UK/Cloudera Profession Services Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/ + +## Airbnb +Airbnb is using RocksDB as a storage engine for their personalized search service. You can learn more about it here: https://www.youtube.com/watch?v=ASQ6XMtogMs + +## Pinterest +Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo + +## Smyte +[Smyte](https://www.smyte.com/) uses RocksDB as the storage layer for their core key-value storage, high-performance counters and time-windowed HyperLogLog services. + +## Rakuten Marketing +[Rakuten Marketing](https://marketing.rakuten.com/) uses RocksDB as the disk cache layer for the real-time bidding service in their Performance DSP. 
+ +## VWO, Wingify +[VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed. + +## quasardb +[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. +quasardb uses a heavily tuned RocksDB as its persistence layer. + +## Netflix +[Netflix](http://techblog.netflix.com/2016/05/application-data-caching-using-ssds.html) uses RocksDB on spinning disks to cache application data. + +## TiKV +[TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer. + +## Apache Flink +[Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine. diff --git a/external/rocksdb/appveyor.yml b/external/rocksdb/appveyor.yml index e13e2d2260..aeb8b948d5 100644 --- a/external/rocksdb/appveyor.yml +++ b/external/rocksdb/appveyor.yml @@ -2,10 +2,15 @@ version: 1.0.{build} before_build: - md %APPVEYOR_BUILD_FOLDER%\build - cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 12 Win64" .. +- cmake -G "Visual Studio 12 Win64" -DOPTDBG=1 -DXPRESS=1 .. - cd .. 
build: - project: build\ALL_BUILD.vcxproj + project: build\rocksdb.sln parallel: true verbosity: minimal -test: off +test: +test_script: +- ps: build_tools\run_ci_db_test.ps1 -EnableRerun -Run db_test2 -Concurrency 8 +- ps: build_tools\run_ci_db_test.ps1 -EnableRerun -Run db_test -Exclude DBTest.GroupCommitTest -Concurrency 10 +- ps: build_tools\run_ci_db_test.ps1 -Run env_test -Concurrency 1 + diff --git a/external/rocksdb/appveyordailytests.yml b/external/rocksdb/appveyordailytests.yml deleted file mode 100644 index a8b4af60cf..0000000000 --- a/external/rocksdb/appveyordailytests.yml +++ /dev/null @@ -1,22 +0,0 @@ -version: 1.0.{build} -before_build: -- md %APPVEYOR_BUILD_FOLDER%\build -- cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 12 Win64" -DOPTDBG=1 .. -- cd .. -build: - project: build\ALL_BUILD.vcxproj - parallel: true - verbosity: minimal -test: -test_script: -- ps: build_tools\run_ci_db_test.ps1 -notifications: - - provider: Email - to: - - svmtrocksdb@microsoft.com - subject: "Build {{status}}" - message: "{{message}}, {{commitId}}, ..." 
- on_build_success: false - on_build_failure: true - on_build_status_changed: true diff --git a/external/rocksdb/arcanist_util/__phutil_library_map__.php b/external/rocksdb/arcanist_util/__phutil_library_map__.php index 274ad16e3a..e3117382b4 100644 --- a/external/rocksdb/arcanist_util/__phutil_library_map__.php +++ b/external/rocksdb/arcanist_util/__phutil_library_map__.php @@ -6,33 +6,66 @@ * @phutil-library-version 2 */ -phutil_register_library_map(array( - '__library_version__' => 2, - 'class' => - array( - 'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php', - 'BaseDirectoryScopedFormatLinter' => 'cpp_linter/BaseDirectoryScopedFormatLinter.php', - 'FacebookArcanistConfiguration' => 'config/FacebookArcanistConfiguration.php', - 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', - 'FacebookFbcodeUnitTestEngine' => 'unit_engine/FacebookFbcodeUnitTestEngine.php', - 'FacebookHowtoevenLintEngine' => 'lint_engine/FacebookHowtoevenLintEngine.php', - 'FacebookHowtoevenLinter' => 'cpp_linter/FacebookHowtoevenLinter.php', - 'FbcodeClangFormatLinter' => 'cpp_linter/FbcodeClangFormatLinter.php', - 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', - ), - 'function' => - array( - ), - 'xmap' => - array( - 'ArcanistCpplintLinter' => 'ArcanistLinter', - 'BaseDirectoryScopedFormatLinter' => 'ArcanistLinter', - 'FacebookArcanistConfiguration' => 'ArcanistConfiguration', - 'FacebookFbcodeLintEngine' => 'ArcanistLintEngine', - 'FacebookFbcodeUnitTestEngine' => 'ArcanistBaseUnitTestEngine', - 'FacebookHowtoevenLintEngine' => 'ArcanistLintEngine', - 'FacebookHowtoevenLinter' => 'ArcanistLinter', - 'FbcodeClangFormatLinter' => 'BaseDirectoryScopedFormatLinter', - 'FbcodeCppLinter' => 'ArcanistLinter', - ), -)); +if (class_exists('ArcanistWorkflow')) { + phutil_register_library_map(array( + '__library_version__' => 2, + 'class' => + array( + 'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php', + 'BaseDirectoryScopedFormatLinter' 
=> 'cpp_linter/BaseDirectoryScopedFormatLinter.php', + 'FacebookArcanistConfiguration' => 'config/FacebookArcanistConfiguration.php', + 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', + 'FacebookFbcodeUnitTestEngine' => 'unit_engine/FacebookFbcodeUnitTestEngine.php', + 'FacebookHowtoevenLintEngine' => 'lint_engine/FacebookHowtoevenLintEngine.php', + 'FacebookHowtoevenLinter' => 'cpp_linter/FacebookHowtoevenLinter.php', + 'FbcodeClangFormatLinter' => 'cpp_linter/FbcodeClangFormatLinter.php', + 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', + ), + 'function' => + array( + ), + 'xmap' => + array( + 'ArcanistCpplintLinter' => 'ArcanistLinter', + 'BaseDirectoryScopedFormatLinter' => 'ArcanistLinter', + 'FacebookArcanistConfiguration' => 'ArcanistConfiguration', + 'FacebookFbcodeLintEngine' => 'ArcanistLintEngine', + 'FacebookFbcodeUnitTestEngine' => 'ArcanistBaseUnitTestEngine', + 'FacebookHowtoevenLintEngine' => 'ArcanistLintEngine', + 'FacebookHowtoevenLinter' => 'ArcanistLinter', + 'FbcodeClangFormatLinter' => 'BaseDirectoryScopedFormatLinter', + 'FbcodeCppLinter' => 'ArcanistLinter', + ), + )); +} else { + phutil_register_library_map(array( + '__library_version__' => 2, + 'class' => + array( + 'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php', + 'BaseDirectoryScopedFormatLinter' => 'cpp_linter/BaseDirectoryScopedFormatLinter.php', + 'FacebookArcanistConfiguration' => 'config/FacebookOldArcanistConfiguration.php', + 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', + 'FacebookFbcodeUnitTestEngine' => 'unit_engine/FacebookFbcodeUnitTestEngine.php', + 'FacebookHowtoevenLintEngine' => 'lint_engine/FacebookHowtoevenLintEngine.php', + 'FacebookHowtoevenLinter' => 'cpp_linter/FacebookHowtoevenLinter.php', + 'FbcodeClangFormatLinter' => 'cpp_linter/FbcodeClangFormatLinter.php', + 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', + ), + 'function' => + array( + ), + 'xmap' => + array( + 
'ArcanistCpplintLinter' => 'ArcanistLinter', + 'BaseDirectoryScopedFormatLinter' => 'ArcanistLinter', + 'FacebookArcanistConfiguration' => 'ArcanistConfiguration', + 'FacebookFbcodeLintEngine' => 'ArcanistLintEngine', + 'FacebookFbcodeUnitTestEngine' => 'ArcanistBaseUnitTestEngine', + 'FacebookHowtoevenLintEngine' => 'ArcanistLintEngine', + 'FacebookHowtoevenLinter' => 'ArcanistLinter', + 'FbcodeClangFormatLinter' => 'BaseDirectoryScopedFormatLinter', + 'FbcodeCppLinter' => 'ArcanistLinter', + ), + )); +} diff --git a/external/rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php b/external/rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php index c3454903b6..d82f9a8734 100644 --- a/external/rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php +++ b/external/rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php @@ -4,32 +4,33 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +require('RocksDBCommonHelper.php'); + +define("DIFF_COMMAND", "diff"); + class FacebookArcanistConfiguration extends ArcanistConfiguration { public function didRunWorkflow($command, - ArcanistBaseWorkflow $workflow, + ArcanistWorkflow $workflow, $error_code) { - if ($command == 'diff' && !$workflow->isRawDiffSource()) { - $this->maybePushToJenkins($workflow); - } - } + // Default options don't terminate on failure, but that's what we want. In + // the current case we use assertions intentionally as "terminate on failure + // invariants". 
+ assert_options(ASSERT_BAIL, true); - ////////////////////////////////////////////////////////////////////// - /* Send off builds to jenkins */ - function maybePushToJenkins($workflow) { - $diffID = $workflow->getDiffID(); - if ($diffID === null) { - return; - } + assert($workflow); + assert(strlen($command) > 0); - $results = $workflow->getTestResults(); - if (!$results) { - return; - } + if ($command == DIFF_COMMAND && !$workflow->isRawDiffSource()) { + $diffID = $workflow->getDiffId(); - $url = "https://ci-builds.fb.com/view/rocksdb/job/rocksdb_diff_check/" - ."buildWithParameters?token=AUTH&DIFF_ID=$diffID"; - system("curl --noproxy '*' \"$url\" > /dev/null 2>&1"); + // When submitting a diff this code path gets executed multiple times in + // a row. We only care about the case when ID for the diff is provided + // because that's what we need to apply the diff and trigger the tests. + if (strlen($diffID) > 0) { + assert(is_numeric($diffID)); + startTestsInSandcastle(true /* $applyDiff */, $workflow, $diffID); + } + } } - } diff --git a/external/rocksdb/arcanist_util/config/FacebookOldArcanistConfiguration.php b/external/rocksdb/arcanist_util/config/FacebookOldArcanistConfiguration.php new file mode 100644 index 0000000000..aaab040dd1 --- /dev/null +++ b/external/rocksdb/arcanist_util/config/FacebookOldArcanistConfiguration.php @@ -0,0 +1,36 @@ + 0); + + if ($command == DIFF_COMMAND && !$workflow->isRawDiffSource()) { + $diffID = $workflow->getDiffId(); + + // When submitting a diff this code path gets executed multiple times in + // a row. We only care about the case when ID for the diff is provided + // because that's what we need to apply the diff and trigger the tests. 
+ if (strlen($diffID) > 0) { + assert(is_numeric($diffID)); + startTestsInSandcastle(true /* $applyDiff */, $workflow, $diffID); + } + } + } +} diff --git a/external/rocksdb/arcanist_util/config/RocksDBCommonHelper.php b/external/rocksdb/arcanist_util/config/RocksDBCommonHelper.php new file mode 100644 index 0000000000..b3adf240cd --- /dev/null +++ b/external/rocksdb/arcanist_util/config/RocksDBCommonHelper.php @@ -0,0 +1,328 @@ + 0); + assert(is_numeric($diffID)); + assert(strlen($url) > 0); + + $cmd = 'echo \'{"diff_id": "' . $diffID . '", ' + . '"name":"click here for sandcastle tests for D' . $diffID . '", ' + . '"link":"' . $url . '"}\' | ' + . 'http_proxy=fwdproxy.any.facebook.com:8080 ' + . 'https_proxy=fwdproxy.any.facebook.com:8080 arc call-conduit ' + . 'differential.updateunitresults'; + shell_exec($cmd); +} + +function buildUpdateTestStatusCmd($diffID, $test, $status) { + assert(strlen($diffID) > 0); + assert(is_numeric($diffID)); + assert(strlen($test) > 0); + assert(strlen($status) > 0); + + $cmd = 'echo \'{"diff_id": "' . $diffID . '", ' + . '"name":"' . $test . '", ' + . '"result":"' . $status . '"}\' | ' + . 'http_proxy=fwdproxy.any.facebook.com:8080 ' + . 'https_proxy=fwdproxy.any.facebook.com:8080 arc call-conduit ' + . 'differential.updateunitresults'; + return $cmd; +} + +function updateTestStatus($diffID, $test) { + assert(strlen($diffID) > 0); + assert(is_numeric($diffID)); + assert(strlen($test) > 0); + + shell_exec(buildUpdateTestStatusCmd($diffID, $test, "waiting")); +} + +function getSteps($applyDiff, $diffID, $username, $test) { + assert(strlen($username) > 0); + assert(strlen($test) > 0); + + if ($applyDiff) { + assert(strlen($diffID) > 0); + assert(is_numeric($diffID)); + + $arcrc_content = exec("cat ~/.arcrc | gzip -f | base64 -w0"); + assert(strlen($arcrc_content) > 0); + + // Sandcastle machines don't have arc setup. We copy the user certificate + // and authenticate using that in Sandcastle. 
+ $setup = array( + "name" => "Setup arcrc", + "shell" => "echo " . $arcrc_content . " | base64 --decode" + . " | gzip -d > ~/.arcrc", + "user" => "root" + ); + + // arc demands certain permission on its config. + // also fix the sticky bit issue in sandcastle + $fix_permission = array( + "name" => "Fix environment", + "shell" => "chmod 600 ~/.arcrc && chmod +t /dev/shm", + "user" => "root" + ); + + // Construct the steps in the order of execution. + $steps[] = $setup; + $steps[] = $fix_permission; + } + + // fbcode is a sub-repo. We cannot patch until we add it to ignore otherwise + // Git thinks it is an uncommited change. + $fix_git_ignore = array( + "name" => "Fix git ignore", + "shell" => "echo fbcode >> .git/info/exclude", + "user" => "root" + ); + + $steps[] = $fix_git_ignore; + + // This will be the command used to execute particular type of tests. + $cmd = ""; + + if ($applyDiff) { + // Patch the code (keep your fingures crossed). + $patch = array( + "name" => "Patch " . $diffID, + "shell" => "HTTPS_PROXY=fwdproxy:8080 arc --arcrc-file ~/.arcrc " + . "patch --nocommit --diff " . $diffID, + "user" => "root" + ); + + $steps[] = $patch; + + updateTestStatus($diffID, $test); + $cmd = buildUpdateTestStatusCmd($diffID, $test, "running") . "; "; + } + + // Run the actual command. + $cmd = $cmd . "J=$(nproc) ./build_tools/precommit_checker.py " . $test + . "; exit_code=$?; "; + + if ($applyDiff) { + $cmd = $cmd . "([[ \$exit_code -eq 0 ]] &&" + . buildUpdateTestStatusCmd($diffID, $test, "pass") . ")" + . "||" . buildUpdateTestStatusCmd($diffID, $test, "fail") + . "; "; + } + + $cmd = $cmd . " cat /tmp/precommit-check.log" + . "; for f in `ls t/log-*`; do echo \$f; cat \$f; done;" + . "[[ \$exit_code -eq 0 ]]"; + assert(strlen($cmd) > 0); + + $run_test = array( + "name" => "Run " . $test, + "shell" => $cmd, + "user" => "root", + "parser" => "python build_tools/error_filter.py " . 
$test, + ); + + $steps[] = $run_test; + + if ($applyDiff) { + // Clean up the user arc config we are using. + $cleanup = array( + "name" => "Arc cleanup", + "shell" => "rm -f ~/.arcrc", + "user" => "root" + ); + + $steps[] = $cleanup; + } + + assert(count($steps) > 0); + return $steps; +} + +function getSandcastleConfig() { + $sandcastle_config = array(); + + // This is a case when we're executed from a continuous run. Fetch the values + // from the environment. + if (getenv(ENV_POST_RECEIVE_HOOK)) { + $sandcastle_config[0] = getenv(ENV_HTTPS_APP_VALUE); + $sandcastle_config[1] = getenv(ENV_HTTPS_TOKEN_VALUE); + } else { + // This is a typical `[p]arc diff` case. Fetch the values from the specific + // configuration files. + assert(file_exists(PRIMARY_TOKEN_FILE) || + file_exists(SECONDARY_TOKEN_FILE)); + + // Try the primary location first, followed by a secondary. + if (file_exists(PRIMARY_TOKEN_FILE)) { + $cmd = 'cat ' . PRIMARY_TOKEN_FILE; + } else { + $cmd = 'cat ' . SECONDARY_TOKEN_FILE; + } + + assert(strlen($cmd) > 0); + $sandcastle_config = explode(':', rtrim(shell_exec($cmd))); + } + + // In this case be very explicit about the implications. + if (count($sandcastle_config) != 2) { + echo "Sandcastle configuration files don't contain valid information " . + "or the necessary environment variables aren't defined. Unable " . + "to validate the code changes."; + exit(1); + } + + assert(strlen($sandcastle_config[0]) > 0); + assert(strlen($sandcastle_config[1]) > 0); + assert(count($sandcastle_config) > 0); + + return $sandcastle_config; +} + +// This function can be called either from `[p]arc diff` command or during +// the Git post-receive hook. + function startTestsInSandcastle($applyDiff, $workflow, $diffID) { + // Default options don't terminate on failure, but that's what we want. In + // the current case we use assertions intentionally as "terminate on failure + // invariants". 
+ assert_options(ASSERT_BAIL, true); + + // In case of a diff we'll send notificatios to the author. Else it'll go to + // the entire team because failures indicate that build quality has regressed. + $username = $applyDiff ? exec("whoami") : CONT_RUN_ALIAS; + assert(strlen($username) > 0); + + if ($applyDiff) { + assert($workflow); + assert(strlen($diffID) > 0); + assert(is_numeric($diffID)); + } + + if (strcmp(getenv("ROCKSDB_CHECK_ALL"), 1) == 0) { + // Extract all tests from the CI definition. + $output = file_get_contents("build_tools/rocksdb-lego-determinator"); + assert(strlen($output) > 0); + + preg_match_all('/[ ]{2}([a-zA-Z0-9_]+)[\)]{1}/', $output, $matches); + $tests = $matches[1]; + assert(count($tests) > 0); + } else { + // Manually list of tests we want to run in Sandcastle. + $tests = array( + "unit", "unit_non_shm", "unit_481", "clang_unit", "tsan", "asan", + "lite_test", "valgrind", "release", "release_481", "clang_release" + ); + } + + $send_email_template = array( + 'type' => 'email', + 'triggers' => array('fail'), + 'emails' => array($username . '@fb.com'), + ); + + // Construct a job definition for each test and add it to the master plan. + foreach ($tests as $test) { + $stepName = "RocksDB diff " . $diffID . " test " . $test; + + if (!$applyDiff) { + $stepName = "RocksDB continuous integration test " . $test; + } + + $arg[] = array( + "name" => $stepName, + "report" => array($send_email_template), + "steps" => getSteps($applyDiff, $diffID, $username, $test) + ); + } + + // We cannot submit the parallel execution master plan to Sandcastle and + // need supply the job plan as a determinator. So we construct a small job + // that will spit out the master job plan which Sandcastle will parse and + // execute. Why compress the job definitions? Otherwise we run over the max + // string size. + $cmd = "echo " . base64_encode(json_encode($arg)) + . 
" | gzip -f | base64 -w0"; + assert(strlen($cmd) > 0); + + $arg_encoded = shell_exec($cmd); + assert(strlen($arg_encoded) > 0); + + $runName = "Run diff " . $diffID . "for user " . $username; + + if (!$applyDiff) { + $runName = "RocksDB continuous integration build and test run"; + } + + $command = array( + "name" => $runName, + "steps" => array() + ); + + $command["steps"][] = array( + "name" => "Generate determinator", + "shell" => "echo " . $arg_encoded . " | base64 --decode | gzip -d" + . " | base64 --decode", + "determinator" => true, + "user" => "root" + ); + + // Submit to Sandcastle. + $url = 'https://interngraph.intern.facebook.com/sandcastle/generate?' + .'command=SandcastleUniversalCommand' + .'&vcs=rocksdb-git&revision=origin%2Fmaster&type=lego' + .'&user=' . $username . '&alias=rocksdb-precommit' + .'&command-args=' . urlencode(json_encode($command)); + + // Fetch the configuration necessary to submit a successful HTTPS request. + $sandcastle_config = getSandcastleConfig(); + + $app = $sandcastle_config[0]; + $token = $sandcastle_config[1]; + + $cmd = 'https_proxy= HTTPS_PROXY= curl -s -k -F app=' . $app . ' ' + . '-F token=' . $token . ' "' . $url . '"'; + + $output = shell_exec($cmd); + assert(strlen($output) > 0); + + // Extract Sandcastle URL from the response. + preg_match('/url": "(.+)"/', $output, $sandcastle_url); + + assert(count($sandcastle_url) > 0, "Unable to submit Sandcastle request."); + assert(strlen($sandcastle_url[1]) > 0, "Unable to extract Sandcastle URL."); + + if ($applyDiff) { + echo "\nSandcastle URL: " . $sandcastle_url[1] . "\n"; + // Ask Phabricator to display it on the diff UI. + postURL($diffID, $sandcastle_url[1]); + } else { + echo "Continuous integration started Sandcastle tests. You can look at "; + echo "the progress at:\n" . $sandcastle_url[1] . "\n"; + } +} + +// Continuous run cript will set the environment variable and based on that +// we'll trigger the execution of tests in Sandcastle. 
In that case we don't +// need to apply any diffs and there's no associated workflow either. +if (getenv(ENV_POST_RECEIVE_HOOK)) { + startTestsInSandcastle( + false /* $applyDiff */, + NULL /* $workflow */, + NULL /* $diffID */); +} diff --git a/external/rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php b/external/rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php index 79966e78dd..4a7b307dc8 100644 --- a/external/rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php +++ b/external/rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php @@ -44,7 +44,7 @@ final public function willLintPaths(array $paths) { $futures[$path] = $this->getFormatFuture($path, $changed); } - foreach (Futures($futures)->limit(8) as $p => $f) { + foreach (id(new FutureIterator($futures))->limit(8) as $p => $f) { $this->rawLintOutput[$p] = $f->resolvex(); } } diff --git a/external/rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php b/external/rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php index 66eefa0049..3dac9bf73e 100644 --- a/external/rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php +++ b/external/rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php @@ -88,6 +88,9 @@ public function getLintNameMap() { } private function getCppLintOutput($path) { + if (!array_key_exists($path, $this->rawLintOutput)) { + return array(); + } list($output) = $this->rawLintOutput[$path]; $msgs = array(); diff --git a/external/rocksdb/arcanist_util/cpp_linter/cpplint.py b/external/rocksdb/arcanist_util/cpp_linter/cpplint.py index d6201945ae..3d0c45a6dd 100644 --- a/external/rocksdb/arcanist_util/cpp_linter/cpplint.py +++ b/external/rocksdb/arcanist_util/cpp_linter/cpplint.py @@ -1,5 +1,5 @@ #!/usr/bin/python -# Copyright (c) 2013, Facebook, Inc. All rights reserved. +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
# This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. @@ -2714,7 +2714,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # Look for < that is not surrounded by spaces. This is only # triggered if both sides are missing spaces, even though - # technically should should flag if at least one side is missing a + # technically should flag if at least one side is missing a # space. This is done to avoid some false positives with shifts. match = Search(r'[^\s<]<([^\s=<].*)', reduced_line) if (match and diff --git a/external/rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php b/external/rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php index 7b12cccdd8..88b0748f7d 100644 --- a/external/rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php +++ b/external/rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php @@ -42,14 +42,12 @@ public function buildLinters() { $python_linter = new ArcanistPEP8Linter(); $linters[] = $python_linter; - if (!$this->getCommitHookMode()) { - $cpp_linters = array(); - $cpp_linters[] = $linters[] = new ArcanistCpplintLinter(); - $cpp_linters[] = $linters[] = new FbcodeCppLinter(); + $cpp_linters = array(); + $cpp_linters[] = $linters[] = new ArcanistCpplintLinter(); + $cpp_linters[] = $linters[] = new FbcodeCppLinter(); - $clang_format_linter = new FbcodeClangFormatLinter(); - $linters[] = $clang_format_linter; - } + $clang_format_linter = new FbcodeClangFormatLinter(); + $linters[] = $clang_format_linter; $spelling_linter = new ArcanistSpellingLinter(); $linters[] = $spelling_linter; diff --git a/external/rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php b/external/rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php index f9a9e70e5e..985bd68fc2 100644 --- 
a/external/rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php +++ b/external/rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php @@ -7,15 +7,11 @@ class FacebookFbcodeUnitTestEngine extends ArcanistBaseUnitTestEngine { public function run() { - // Here we create a new unit test "jenkins_async_test" and promise we'll - // update the results later. - // Jenkins updates the results using `arc call-conduit - // differential.updateunitresults` call. If you change the name here, also - // make sure to change the name in Jenkins script that updates the test - // result -- they have to be the same. - $result = new ArcanistUnitTestResult(); - $result->setName("jenkins_async_test"); - $result->setResult(ArcanistUnitTestResult::RESULT_POSTPONED); - return array($result); + // For a call to `arc call-conduit differential.updateunitresults` to + // succeed we need at least one entry here. + $result = new ArcanistUnitTestResult(); + $result->setName("dummy_placeholder_entry"); + $result->setResult(ArcanistUnitTestResult::RESULT_PASS); + return array($result); } } diff --git a/external/rocksdb/build_tools/build_detect_platform b/external/rocksdb/build_tools/build_detect_platform index 0e40ac52fe..afff3794f0 100755 --- a/external/rocksdb/build_tools/build_detect_platform +++ b/external/rocksdb/build_tools/build_detect_platform @@ -8,6 +8,7 @@ # CXX C++ Compiler path # PLATFORM_LDFLAGS Linker flags # JAVA_LDFLAGS Linker flags for RocksDBJava +# JAVA_STATIC_LDFLAGS Linker flags for RocksDBJava static build # PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_LDFLAGS Flags for building shared library # PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library @@ -44,7 +45,7 @@ fi # we depend on C++11 PLATFORM_CXXFLAGS="-std=c++11" # we currently depend on POSIX platform -COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" +COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX" # Default to fbcode gcc on internal fb machines if 
[ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then @@ -181,23 +182,26 @@ esac PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" JAVA_LDFLAGS="$PLATFORM_LDFLAGS" +JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS" if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Cross-compiling; do not try any compilation tests. # Also don't need any compilation tests if compiling on fbcode true else - # Test whether fallocate is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < - #include - int main() { - int fd = open("/dev/null", 0); - fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); - } + if ! test $ROCKSDB_DISABLE_FALLOCATE; then + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + #include + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); + } EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi fi # Test whether Snappy library is installed @@ -284,7 +288,7 @@ EOF # Test whether numa is available $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null < - #inlcude + #include int main() {} EOF if [ "$?" = 0 ]; then @@ -295,13 +299,14 @@ EOF # Test whether jemalloc is available if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ - 2>/dev/null; then + 2>/dev/null; then PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ljemalloc" JAVA_LDFLAGS="$JAVA_LDFLAGS -ljemalloc" + JEMALLOC=1 else # jemalloc is not available. Let's try tcmalloc if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \ - -ltcmalloc 2>/dev/null; then + -ltcmalloc 2>/dev/null; then PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc" JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc" fi @@ -318,6 +323,55 @@ EOF if [ "$?" 
= 0 ]; then COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE" fi + + # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int x = PTHREAD_MUTEX_ADAPTIVE_NP; + return 0; + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX" + fi + + # Test whether backtrace is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <> + int main() { + void* frames[1]; + backtrace_symbols(frames, backtrace(frames, 1)); + return 0; + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" + else + # Test whether execinfo library is installed + $CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null < + int main() { + void* frames[1]; + backtrace_symbols(frames, backtrace(frames, 1)); + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lexecinfo" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lexecinfo" + fi + fi + + # Test if -pg is supported + $CXX $CFLAGS -pg -x c++ - -o /dev/null 2>/dev/null <> "$OUTPUT" echo "PLATFORM=$PLATFORM" >> "$OUTPUT" echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT" echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT" echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT" echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT" echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT" @@ -389,3 +452,7 @@ echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT" echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" +echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT" +if test -n "$JEMALLOC"; then + echo "JEMALLOC=1" >> "$OUTPUT" +fi diff --git a/external/rocksdb/build_tools/fbcode_config.sh b/external/rocksdb/build_tools/fbcode_config.sh index 572c0fe685..1c2416c913 100755 --- 
a/external/rocksdb/build_tools/fbcode_config.sh +++ b/external/rocksdb/build_tools/fbcode_config.sh @@ -6,103 +6,105 @@ # Environment variables that change the behavior of this script: # PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included + +BASEDIR=`dirname $BASH_SOURCE` +source "$BASEDIR/dependencies.sh" + CFLAGS="" -# location of libgcc -LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e" +# libgcc LIBGCC_INCLUDE="$LIBGCC_BASE/include" -LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" - -# location of glibc -GLIBC_REV=7397bed99280af5d9543439cdb7d018af7542720 -GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/include" -GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/lib" +LIBGCC_LIBS=" -L $LIBGCC_BASE/lib" -SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/" +# glibc +GLIBC_INCLUDE="$GLIBC_BASE/include" +GLIBC_LIBS=" -L $GLIBC_BASE/lib" +# snappy +SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/" if test -z $PIC_BUILD; then - SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a" + SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a" else - SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy_pic.a" + SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a" fi - CFLAGS+=" -DSNAPPY" if test -z $PIC_BUILD; then # location of zlib headers and libraries - ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/" - ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a" + ZLIB_INCLUDE=" -I 
$ZLIB_BASE/include/" + ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a" CFLAGS+=" -DZLIB" # location of bzip headers and libraries - BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/" - BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a" + BZIP_INCLUDE=" -I $BZIP2_BASE/include/" + BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a" CFLAGS+=" -DBZIP2" - LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/" - LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a" + LZ4_INCLUDE=" -I $LZ4_BASE/include/" + LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" CFLAGS+=" -DLZ4" - ZSTD_REV=8df2d01673ae6afcc8c8d16fec862b2d67ecc1e9 - ZSTD_INCLUDE=" -I /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/include" - ZSTD_LIBS=" /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/lib/libzstd.a" + ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" + ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" CFLAGS+=" -DZSTD" fi # location of gflags headers and libraries -GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/" +GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" if test -z $PIC_BUILD; then - GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a" + GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a" else - GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags_pic.a" + GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a" fi CFLAGS+=" -DGFLAGS=google" # location of jemalloc -JEMALLOC_INCLUDE=" -I 
/mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/" -JEMALLOC_LIB=" /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a" +JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/" +JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a" if test -z $PIC_BUILD; then # location of numa - NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/" - NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a" + NUMA_INCLUDE=" -I $NUMA_BASE/include/" + NUMA_LIB=" $NUMA_BASE/lib/libnuma.a" CFLAGS+=" -DNUMA" # location of libunwind - LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a" + LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a" fi # use Intel SSE support for checksum calculations export USE_SSE=1 -BINUTILS="/mnt/gvfs/third-party2/binutils/0b6ad0c88ddd903333a48ae8bff134efac468e4a/2.25/centos6-native/da39a3e/bin" +BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" -GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4" STDLIBS="-L $GCC_BASE/lib64" -CLANG_BASE="/mnt/gvfs/third-party2/clang/d81444dd214df3d2466734de45bb264a0486acc3/dev" -CLANG_BIN="$CLANG_BASE/centos6-native/af4b1a0/bin" +CLANG_BIN="$CLANG_BASE/bin" +CLANG_LIB="$CLANG_BASE/lib" +CLANG_SRC="$CLANG_BASE/../../src" + CLANG_ANALYZER="$CLANG_BIN/clang++" -CLANG_SCAN_BUILD="$CLANG_BASE/src/clang/tools/scan-build/scan-build" +CLANG_SCAN_BUILD="$CLANG_SRC/llvm/tools/clang/tools/scan-build/bin/scan-build" if [ -z "$USE_CLANG" ]; then # gcc CC="$GCC_BASE/bin/gcc" CXX="$GCC_BASE/bin/g++" - + CFLAGS+=" -B$BINUTILS/gold" 
CFLAGS+=" -isystem $GLIBC_INCLUDE" CFLAGS+=" -isystem $LIBGCC_INCLUDE" + JEMALLOC=1 else - # clang - CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/" + # clang + CLANG_INCLUDE="$CLANG_LIB/clang/stable/include" CC="$CLANG_BIN/clang" CXX="$CLANG_BIN/clang++" - KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include" + KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include" CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x " @@ -112,11 +114,12 @@ else CFLAGS+=" -isystem $CLANG_INCLUDE" CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " + CFLAGS+=" -Wno-expansion-to-defined " CXXFLAGS="-nostdinc++" fi CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB" @@ -128,6 +131,6 @@ PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS" -VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/" +VALGRIND_VER="$VALGRIND_BASE/bin/" export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD diff --git a/external/rocksdb/build_tools/fbcode_config4.8.1.sh b/external/rocksdb/build_tools/fbcode_config4.8.1.sh index 524a5ed7fa..d338bf6b65 100755 --- a/external/rocksdb/build_tools/fbcode_config4.8.1.sh +++ b/external/rocksdb/build_tools/fbcode_config4.8.1.sh @@ -4,80 +4,78 @@ # fbcode settings. 
It uses the latest g++ compiler and also # uses jemalloc +BASEDIR=`dirname $BASH_SOURCE` +source "$BASEDIR/dependencies_4.8.1.sh" + # location of libgcc -LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc" LIBGCC_INCLUDE="$LIBGCC_BASE/include" -LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" +LIBGCC_LIBS=" -L $LIBGCC_BASE/lib" # location of glibc -GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa -GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include" -GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib" +GLIBC_INCLUDE="$GLIBC_BASE/include" +GLIBC_LIBS=" -L $GLIBC_BASE/lib" # location of snappy headers and libraries -SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include" -SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a" +SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include" +SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a" # location of zlib headers and libraries -ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include" -ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a" +ZLIB_INCLUDE=" -I $ZLIB_BASE/include" +ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a" # location of bzip headers and libraries -BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" -BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a" +BZIP2_INCLUDE=" -I $BZIP2_BASE/include/" +BZIP2_LIBS=" $BZIP2_BASE/lib/libbz2.a" -LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b -LZ4_INCLUDE=" -I 
/mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include" -LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" +LZ4_INCLUDE=" -I $LZ4_BASE/include" +LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" -ZSTD_REV=8df2d01673ae6afcc8c8d16fec862b2d67ecc1e9 -ZSTD_INCLUDE=" -I /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/include" -ZSTD_LIBS=" /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/lib/libzstd.a" +ZSTD_INCLUDE=" -I $ZSTD_BASE/include" +ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" # location of gflags headers and libraries -GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" -GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a" +GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" +GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a" # location of jemalloc -JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/3691c776ac26dd8781e84f8888b6a0fbdbc0a9ed/dev/gcc-4.8.1-glibc-2.17/4d53c6f/include" -JEMALLOC_LIB="/mnt/gvfs/third-party2/jemalloc/3691c776ac26dd8781e84f8888b6a0fbdbc0a9ed/dev/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a" +JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include" +JEMALLOC_LIB="$JEMALLOC_BASE/lib/libjemalloc.a" # location of numa -NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65 -NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" -NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" +NUMA_INCLUDE=" -I $NUMA_BASE/include/" +NUMA_LIB=" $NUMA_BASE/lib/libnuma.a" # location of libunwind -LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc -LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a" +LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a" # use Intel SSE support for 
checksum calculations export USE_SSE=1 -BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin" +BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" -DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" +DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP2_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" -GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc" STDLIBS="-L $GCC_BASE/lib64" if [ -z "$USE_CLANG" ]; then # gcc CC="$GCC_BASE/bin/gcc" CXX="$GCC_BASE/bin/g++" - + CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic" CFLAGS+=" -isystem $GLIBC_INCLUDE" CFLAGS+=" -isystem $LIBGCC_INCLUDE" + JEMALLOC=1 else - # clang - CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4" - CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include" - CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang" - CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++" + # clang + CLANG_BIN="$CLANG_BASE/bin" + CLANG_LIB="$CLANG_BASE/lib" + CLANG_INCLUDE="$CLANG_LIB/clang/*/include" + CC="$CLANG_BIN/clang" + CXX="$CLANG_BIN/clang++" - KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/" + KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include/" CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 " @@ -91,20 +89,19 @@ else fi CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DZSTD -DNUMA" CXXFLAGS+=" $CFLAGS" -EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS 
$LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB" +EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP2_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB" EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" EXEC_LDFLAGS+=" $LIBUNWIND" EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib" PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS" +EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP2_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS" -VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0 -VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/" +VALGRIND_VER="$VALGRIND_BASE/bin/" export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE diff --git a/external/rocksdb/build_tools/make_new_version.sh b/external/rocksdb/build_tools/make_new_version.sh index 409944f833..76a8473557 100755 --- a/external/rocksdb/build_tools/make_new_version.sh +++ b/external/rocksdb/build_tools/make_new_version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2013, Facebook, Inc. All rights reserved. +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. @@ -10,7 +10,7 @@ then GIT="git" fi -# Print out the colored progress info so that it can be brainlessly +# Print out the colored progress info so that it can be brainlessly # distinguished by users. 
function title() { echo -e "\033[1;32m$*\033[0m" diff --git a/external/rocksdb/build_tools/regression_build_test.sh b/external/rocksdb/build_tools/regression_build_test.sh index ee2d334f0b..8ac1ceecec 100755 --- a/external/rocksdb/build_tools/regression_build_test.sh +++ b/external/rocksdb/build_tools/regression_build_test.sh @@ -243,7 +243,7 @@ make release --bloom_bits=10 \ --num=$((NUM / 4)) \ --reads=$((NUM / 4)) \ - --writes_per_second=1000 \ + --benchmark_write_rate_limit=$(( 110 * 1024 )) \ --write_buffer_size=100000000 \ --cache_size=6442450944 \ --cache_numshardbits=6 \ @@ -329,7 +329,7 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --use_existing_db=1 \ --duration=600 \ --threads=32 \ - --writes_per_second=81920 > ${STAT_FILE}.readwhilewriting_in_ram + --benchmark_write_rate_limit=9502720 > ${STAT_FILE}.readwhilewriting_in_ram # Seekrandomwhilewriting ./db_bench \ @@ -342,7 +342,7 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --use_tailing_iterator=1 \ --duration=600 \ --threads=32 \ - --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram + --benchmark_write_rate_limit=9502720 > ${STAT_FILE}.seekwhilewriting_in_ram # measure fillseq with bunch of column families ./db_bench \ diff --git a/external/rocksdb/build_tools/rocksdb-lego-determinator b/external/rocksdb/build_tools/rocksdb-lego-determinator index 392231cca4..c3f32820c3 100755 --- a/external/rocksdb/build_tools/rocksdb-lego-determinator +++ b/external/rocksdb/build_tools/rocksdb-lego-determinator @@ -8,7 +8,7 @@ # Input Value # ------------------------------------------------------------------------- # EMAIL Email address to report on trigger conditions -# ONCAL Email address to raise a task on failure +# ONCALL Email address to raise a task on failure # TRIGGER Trigger conditions for email. Valid values are fail, warn, all # SUBSCRIBER Email addresss to add as subscriber for task # @@ -42,6 +42,9 @@ if [ ! 
-z $ONCALL ]; then }," fi +# For now, create the tasks using only the dedicated task creation tool. +CREATE_TASK= + REPORT= if [[ ! -z $REPORT_EMAIL || ! -z $CREATE_TASK ]]; then REPORT="'report': [ @@ -56,18 +59,58 @@ fi CLEANUP_ENV=" { 'name':'Cleanup environment', - 'shell':'rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && make clean', + 'shell':'rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && chmod +t /dev/shm && make clean', 'user':'root' }" +# We will eventually set the RATIO to 1, but we want do this +# in steps. RATIO=$(nproc) will make it work as J=1 +if [ -z $RATIO ]; then + RATIO=$(nproc) +fi + +if [ -z $PARALLEL_J ]; then + PARALLEL_J="J=$(expr $(nproc) / ${RATIO})" +fi + +if [ -z $PARALLEL_j ]; then + PARALLEL_j="-j$(expr $(nproc) / ${RATIO})" +fi + +PARALLELISM="$PARALLEL_J $PARALLEL_j" + DEBUG="OPT=-g" SHM="TEST_TMPDIR=/dev/shm/rocksdb" GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1" ASAN="COMPILE_WITH_ASAN=1" CLANG="USE_CLANG=1" -LITE="OPT=-DROCKSDB_LITE" +LITE="OPT=\"-DROCKSDB_LITE -g\"" TSAN="COMPILE_WITH_TSAN=1" +UBSAN="COMPILE_WITH_UBSAN=1" DISABLE_JEMALLOC="DISABLE_JEMALLOC=1" +HTTP_PROXY="https_proxy=http://fwdproxy.29.prn1:8080 http_proxy=http://fwdproxy.29.prn1:8080 ftp_proxy=http://fwdproxy.29.prn1:8080" +SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-7u10-64/; export PATH=\$PATH:\$JAVA_HOME/bin" +PARSER="'parser':'python build_tools/error_filter.py $1'" + +CONTRUN_NAME="ROCKSDB_CONTRUN_NAME" + +# This code is getting called under various scenarios. What we care about is to +# understand when it's called from nightly contruns because in that case we'll +# create tasks for any failures. To follow the existing pattern, we'll check +# the value of $ONCALL. If it's a diff then just call `false` to make sure +# that errors will be properly propagated to the caller. +if [ ! 
-z $ONCALL ]; then + TASK_CREATION_TOOL="/usr/local/bin/mysql_mtr_filter --rocksdb" +else + TASK_CREATION_TOOL="false" +fi + +ARTIFACTS=" 'artifacts': [ + { + 'name':'database', + 'paths':[ '/dev/shm/rocksdb' ], + } +]" # # A mechanism to disable tests temporarily @@ -99,14 +142,16 @@ PARALLEL_UNIT_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version', - 'shell':'$DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*', - 'user':'root' + 'shell':'$DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || $CONTRUN_NAME=punit_check $TASK_CREATION_TOOL || cat t/log-*', + 'user':'root', + $PARSER }, $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version under gcc-4.8.1', - 'shell':'$GCC_481 $DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*', - 'user':'root' + 'shell':'$GCC_481 $DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || $CONTRUN_NAME=punit_check_gcc481 $TASK_CREATION_TOOL || cat t/log-*', + 'user':'root', + $PARSER }, ], $REPORT @@ -124,8 +169,51 @@ UNIT_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $DEBUG make J=1 check', - 'user':'root' + 'shell':'$SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB unit test not under /dev/shm +# +UNIT_TEST_NON_SHM_COMMANDS="[ + { + 'name':'Rocksdb Unit Test', + 'oncall':'$ONCALL', + 'timeout': 86400, + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and test RocksDB debug version', + 'timeout': 86400, + 'shell':'$DEBUG make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB release build and unit tests +# +RELEASE_BUILD_COMMANDS="[ + { + 'name':'Rocksdb Release Build', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build RocksDB release', + 'shell':'make $PARALLEL_j 
release || $CONTRUN_NAME=release $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -143,8 +231,29 @@ UNIT_TEST_COMMANDS_481="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $GCC_481 $DEBUG make J=1 check', - 'user':'root' + 'shell':'$SHM $GCC_481 $DEBUG make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB release build and unit tests +# +RELEASE_BUILD_COMMANDS_481="[ + { + 'name':'Rocksdb Release on GCC 4.8.1', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build RocksDB release on GCC 4.8.1', + 'shell':'$GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -162,8 +271,29 @@ CLANG_UNIT_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug', - 'shell':'$CLANG $SHM $DEBUG make J=1 check', - 'user':'root' + 'shell':'$CLANG $SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB release build with CLANG +# +CLANG_RELEASE_BUILD_COMMANDS="[ + { + 'name':'Rocksdb CLANG Release Build', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build RocksDB release', + 'shell':'$CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -181,8 +311,9 @@ CLANG_ANALYZE_COMMANDS="[ $CLEANUP_ENV, { 'name':'RocksDB build and analyze', - 'shell':'$CLANG $SHM $DEBUG make J=1 analyze', - 'user':'root' + 'shell':'$CLANG $SHM $DEBUG make $PARALLEL_j analyze || $CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -200,8 +331,9 @@ CODE_COV_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build, test and collect code coverage info', - 'shell':'$SHM $DEBUG make J=1 coverage', - 'user':'root' + 'shell':'$SHM $DEBUG make 
$PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -219,8 +351,9 @@ UNITY_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build, test unity test', - 'shell':'$SHM $DEBUG V=1 make J=1 unity_test', - 'user':'root' + 'shell':'$SHM $DEBUG V=1 make J=1 unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -238,8 +371,29 @@ LITE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB debug version', - 'shell':'$LITE $DEBUG make J=1 static_lib', - 'user':'root' + 'shell':'$LITE make J=1 static_lib || $CONTRUN_NAME=lite_static_lib $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB lite tests +# +LITE_UNIT_TEST_COMMANDS="[ + { + 'name':'Rocksdb Lite Unit Test', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build RocksDB debug version', + 'shell':'$SHM $LITE make J=1 check || $CONTRUN_NAME=lite_check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -258,20 +412,45 @@ STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG make J=1 db_stress', - 'user':'root' + 'shell':'$SHM $DEBUG make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, { 'name':'Build and run RocksDB debug crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 crash_test', - 'user':'root' + 'shell':'$SHM $DEBUG make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER } ], + $ARTIFACTS, $REPORT } ]" +# RocksDB write stress test. +# We run on disk device on purpose (i.e. 
no $SHM) +# because we want to add some randomness to fsync commands +WRITE_STRESS_COMMANDS="[ + { + 'name':'Rocksdb Write Stress Test', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and run RocksDB write stress tests', + 'shell':'make write_stress && python tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + } + ], + 'artifacts': [{'name': 'database', 'paths': ['/tmp/rocksdb_write_stress']}], + $REPORT + } +]" + + # # RocksDB test under address sanitizer # @@ -283,8 +462,9 @@ ASAN_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Test RocksDB debug under ASAN', - 'shell':'set -o pipefail && $SHM $ASAN $DEBUG make J=1 asan_check |& /usr/facebook/ops/scripts/asan_symbolize.py -d', - 'user':'root' +'shell':'set -o pipefail && ($SHM $ASAN $DEBUG make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL) |& /usr/facebook/ops/scripts/asan_symbolize.py -d', + 'user':'root', + $PARSER } ], $REPORT @@ -304,8 +484,51 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 asan_crash_test', - 'user':'root' + 'shell':'$SHM $DEBUG make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB test under undefined behavior sanitizer +# +UBSAN_TEST_COMMANDS="[ + { + 'name':'Rocksdb Unit Test under UBSAN', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Test RocksDB debug under UBSAN', + 'shell':'set -o pipefail && $SHM $UBSAN $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + } + ], + $REPORT + } +]" + +# +# RocksDB crash testing under udnefined behavior sanitizer +# +UBSAN_CRASH_TEST_COMMANDS="[ + { + 'name':'Rocksdb crash test under UBSAN', + 'oncall':'$ONCALL', + 'timeout': 86400, + 
'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and run RocksDB debug ubsan_crash_test', + 'timeout': 86400, + 'shell':'$SHM $DEBUG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -319,12 +542,15 @@ VALGRIND_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under valgrind', 'oncall':'$ONCALL', + 'timeout': 86400, 'steps': [ $CLEANUP_ENV, { 'name':'Run RocksDB debug unit tests', - 'shell':'$DISABLE_JEMALLOC $SHM $DEBUG make valgrind_check', - 'user':'root' + 'timeout': 86400, + 'shell':'$DISABLE_JEMALLOC $SHM $DEBUG make $PARALLELISM valgrind_check || $CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -338,12 +564,15 @@ TSAN_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under TSAN', 'oncall':'$ONCALL', + 'timeout': 86400, 'steps': [ $CLEANUP_ENV, { 'name':'Run RocksDB debug unit test', - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 check', - 'user':'root' + 'timeout': 86400, + 'shell':'set -o pipefail && $SHM $DEBUG $TSAN make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -363,8 +592,9 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 crash_test', - 'user':'root' + 'shell':'set -o pipefail && $SHM $DEBUG $TSAN CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -393,9 +623,9 @@ run_format_compatible() if [ -e "build_detect_platform" ] then sed "s/tcmalloc/nothingnothingnothing/g" build_detect_platform > $TEST_TMPDIR/temp_build_file - rm -rf build_detect_platform + rm -rf build_detect_platform cp $TEST_TMPDIR/temp_build_file build_detect_platform - chmod +x build_detect_platform + chmod +x build_detect_platform fi make ldb -j32 @@ -424,8 +654,9 @@ FORMAT_COMPATIBLE_COMMANDS="[ $CLEANUP_ENV, { 
'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_format_compatible', - 'user':'root' + 'shell':'build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -456,8 +687,9 @@ NO_COMPRESSION_COMMANDS="[ $CLEANUP_ENV, { 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_no_compression', - 'user':'root' + 'shell':'build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT @@ -512,14 +744,36 @@ REGRESSION_COMMANDS="[ $CLEANUP_ENV, { 'name':'Make and run script', - 'shell':'build_tools/rocksdb-lego-determinator run_regression', - 'user':'root' + 'shell':'build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB Java build +# +JAVA_BUILD_TEST_COMMANDS="[ + { + 'name':'Rocksdb Java Build', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build RocksDB for Java', + 'shell':'$SETUP_JAVA_ENV; $SHM make rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL', + 'user':'root', + $PARSER }, ], $REPORT } ]" + case $1 in punit) echo $PARALLEL_UNIT_TEST_COMMANDS @@ -527,12 +781,24 @@ case $1 in unit) echo $UNIT_TEST_COMMANDS ;; + unit_non_shm) + echo $UNIT_TEST_NON_SHM_COMMANDS + ;; + release) + echo $RELEASE_BUILD_COMMANDS + ;; unit_481) echo $UNIT_TEST_COMMANDS_481 ;; + release_481) + echo $RELEASE_BUILD_COMMANDS_481 + ;; clang_unit) echo $CLANG_UNIT_TEST_COMMANDS ;; + clang_release) + echo $CLANG_RELEASE_BUILD_COMMANDS + ;; clang_analyze) echo $CLANG_ANALYZE_COMMANDS ;; @@ -545,15 +811,27 @@ case $1 in lite) echo $LITE_BUILD_COMMANDS ;; + lite_test) + echo $LITE_UNIT_TEST_COMMANDS + ;; stress_crash) echo $STRESS_CRASH_TEST_COMMANDS ;; + write_stress) + echo 
$WRITE_STRESS_COMMANDS + ;; asan) echo $ASAN_TEST_COMMANDS ;; asan_crash) echo $ASAN_CRASH_TEST_COMMANDS ;; + ubsan) + echo $UBSAN_TEST_COMMANDS + ;; + ubsan_crash) + echo $UBSAN_CRASH_TEST_COMMANDS + ;; valgrind) echo $VALGRIND_TEST_COMMANDS ;; @@ -581,6 +859,9 @@ case $1 in run_regression) run_regression ;; + java_build) + echo $JAVA_BUILD_TEST_COMMANDS + ;; *) echo "Invalid determinator command" ;; diff --git a/external/rocksdb/build_tools/run_ci_db_test.ps1 b/external/rocksdb/build_tools/run_ci_db_test.ps1 index 5f47f3d874..94d81cc24c 100755 --- a/external/rocksdb/build_tools/run_ci_db_test.ps1 +++ b/external/rocksdb/build_tools/run_ci_db_test.ps1 @@ -1,14 +1,19 @@ # This script enables you running RocksDB tests by running -# All the tests in paralell and utilizing all the cores +# All the tests in parallel and utilizing all the cores # For db_test the script first lists and parses the tests # and then fires them up in parallel using async PS Job functionality # Run the script from the enlistment Param( [switch]$EnableJE = $false, # Use je executable + [switch]$EnableRerun = $false, # Rerun failed tests sequentially at the end [string]$WorkFolder = "", # Direct tests to use that folder [int]$Limit = -1, # -1 means run all otherwise limit for testing purposes [string]$Exclude = "", # Expect a comma separated list, no spaces - [string]$Run = "db_test" # Run db_test|tests + [string]$Run = "db_test", # Run db_test|db_test2|tests|testname1,testname2... + # Number of async tasks that would run concurrently. Recommend a number below 64. + # However, CPU utlization really depends on the storage media. Recommend ram based disk. 
+ # a value of 1 will run everything serially + [int]$Concurrency = 16 ) # Folders and commands must be fullpath to run assuming @@ -41,15 +46,25 @@ if($WorkFolder -eq "") { $Env:TEST_TMPDIR = $WorkFolder } +Write-Output "Root: $RootFolder, WorkFolder: $WorkFolder" + # Use JEMALLOC executables -if($EnableJE) { - $db_test = -Join ($BinariesFolder, "db_test_je.exe") -} else { - $db_test = -Join ($BinariesFolder, "db_test.exe") +if($Run -ceq "db_test" -or + $Run -ceq "db_test2" ) { + + $file_name = $Run + + if($EnableJE) { + $file_name += "_je" + } + + $file_name += ".exe" + + $db_test = -Join ($BinariesFolder, $file_name) + + Write-Output "Binaries: $BinariesFolder db_test: $db_test" } -Write-Output "Root: $RootFolder, WorkFolder: $WorkFolder" -Write-Output "Binaries: $BinariesFolder exe: $db_test" #Exclusions that we do not want to run $ExcludeTests = New-Object System.Collections.Generic.HashSet[string] @@ -57,7 +72,7 @@ $ExcludeTests = New-Object System.Collections.Generic.HashSet[string] if($Exclude -ne "") { Write-Host "Exclude: $Exclude" - $l = $Exclude -split ',' + $l = $Exclude -split ' ' ForEach($t in $l) { $ExcludeTests.Add($t) | Out-Null } } @@ -109,38 +124,70 @@ function Normalize-DbTests($HashTable) { $test_log = $test -replace '[\./]','_' $test_log += ".log" + $log_path = -join ($LogFolder, $test_log) # Add to a hashtable - $HashTable.Add($test, $test_log); + $HashTable.Add($test, $log_path); } } } +# The function removes trailing .exe siffix if any, +# creates a name for the log file +function MakeAndAdd([string]$token, $HashTable) { + $test_name = $token -replace '.exe$', '' + $log_name = -join ($test_name, ".log") + $log_path = -join ($LogFolder, $log_name) + if(!$ExcludeTests.Contains($test_name)) { + $HashTable.Add($test_name, $log_path) + } else { + Write-Warning "Test $test_name is excluded" + } +} + # The function scans build\Debug folder to discover # Test executables. 
It then populates a table with # Test executable name -> Log file -function Discover-TestBinaries($HashTable) { +function Discover-TestBinaries([string]$Pattern, $HashTable) { $Exclusions = @("db_test*", "db_sanity_test*") - $p = -join ($BinariesFolder, "*_test*.exe") + + $p = -join ($BinariesFolder, $pattern) + + Write-Host "Path: $p" dir -Path $p -Exclude $Exclusions | ForEach-Object { - $t = ($_.Name) -replace '.exe$', '' - $test_log = -join ($t, ".log") - $HashTable.Add($t, $test_log) + MakeAndAdd -token ($_.Name) -HashTable $HashTable } } -$TestToLog = [ordered]@{} +$TestsToRun = [ordered]@{} -if($Run -ceq "db_test") { - Normalize-DbTests -HashTable $TestToLog +if($Run -ceq "db_test" -or + $Run -ceq "db_test2") { + Normalize-DbTests -HashTable $TestsToRun } elseif($Run -ceq "tests") { - Discover-TestBinaries -HashTable $TestToLog + if($EnableJE) { + $pattern = "*_test_je.exe" + } else { + $pattern = "*_test.exe" + } + Discover-TestBinaries -Pattern $pattern -HashTable $TestsToRun +} else { + + $test_list = $Run -split ' ' + + ForEach($t in $test_list) { + MakeAndAdd -token $t -HashTable $TestsToRun + } } +$NumTestsToStart = $TestsToRun.Count +if($Limit -ge 0 -and $NumTestsToStart -gt $Limit) { + $NumTestsToStart = $Limit +} -Write-Host "Attempting to start: " ($TestToLog.Count) " tests" +Write-Host "Attempting to start: $NumTestsToStart tests" # Invoke a test with a filter and redirect all output $InvokeTestCase = { @@ -154,93 +201,124 @@ $InvokeTestAsync = { &$exe > $log 2>&1 } -$jobs = @() -$JobToLog = @{} +# Hash that contains tests to rerun if any failed +# Those tests will be rerun sequentially +$Rerun = [ordered]@{} # Test limiting factor here $count = 0 +# Overall status +[bool]$success = $true; -ForEach($k in $TestToLog.keys) { +function RunJobs($TestToLog, [int]$ConcurrencyVal, [bool]$AddForRerun) +{ + # Array to wait for any of the running jobs + $jobs = @() + # Hash JobToLog + $JobToLog = @{} - Write-Host "Starting $k" - $log_path = -join 
($LogFolder, ($TestToLog.$k)) + # Wait for all to finish and get the results + while(($JobToLog.Count -gt 0) -or + ($TestToLog.Count -gt 0)) { - if($Run -ceq "db_test") { - $job = Start-Job -Name $k -ScriptBlock $InvokeTestCase -ArgumentList @($db_test,$k,$log_path) - } else { - [string]$Exe = -Join ($BinariesFolder, $k) - $job = Start-Job -Name $k -ScriptBlock $InvokeTestAsync -ArgumentList @($exe,$log_path) - } + # Make sure we have maximum concurrent jobs running if anything + # and the $Limit either not set or allows to proceed + while(($JobToLog.Count -lt $ConcurrencyVal) -and + (($TestToLog.Count -gt 0) -and + (($Limit -lt 0) -or ($count -lt $Limit)))) { - $JobToLog.Add($job, $log_path) - # Limiting trial runs - if(($Limit -gt 0) -and (++$count -ge $Limit)) { - break - } -} + # We only need the first key + foreach($key in $TestToLog.keys) { + $k = $key + break + } -[bool]$success = $true; + Write-Host "Starting $k" + $log_path = ($TestToLog.$k) -# Wait for all to finish and get the results -while($JobToLog.Count -gt 0) { + if($Run -ceq "db_test" -or + $Run -ceq "db_test2") { + $job = Start-Job -Name $k -ScriptBlock $InvokeTestCase -ArgumentList @($db_test,$k,$log_path) + } else { + [string]$Exe = -Join ($BinariesFolder, $k) + $job = Start-Job -Name $k -ScriptBlock $InvokeTestAsync -ArgumentList @($exe,$log_path) + } - $jobs = @() - foreach($k in $JobToLog.Keys) { $jobs += $k } + $JobToLog.Add($job, $log_path) + $TestToLog.Remove($k) -<# - if(!$success) { - break - } -#> + ++$count + } - $completed = Wait-Job -Job $jobs -Any - $log = $JobToLog[$completed] - $JobToLog.Remove($completed) + if($JobToLog.Count -lt 1) { + break + } - $message = -join @($completed.Name, " State: ", ($completed.State)) + $jobs = @() + foreach($k in $JobToLog.Keys) { $jobs += $k } - $log_content = @(Get-Content $log) + $completed = Wait-Job -Job $jobs -Any + $log = $JobToLog[$completed] + $JobToLog.Remove($completed) - if($completed.State -ne "Completed") { - $success = $false - 
Write-Warning $message - $log_content | Write-Warning - } else { - # Scan the log. If we find PASSED and no occurence of FAILED - # then it is a success - [bool]$pass_found = $false - ForEach($l in $log_content) { - - if(($l -match "^\[\s+FAILED") -or - ($l -match "Assertion failed:")) { - $pass_found = $false - break - } + $message = -join @($completed.Name, " State: ", ($completed.State)) - if(($l -match "^\[\s+PASSED") -or - ($l -match " : PASSED$") -or - ($l -match "^PASSED") -or - ($l -match "Passed all tests!") ) { - $pass_found = $true - } - } + $log_content = @(Get-Content $log) - if(!$pass_found) { - $success = $false; + if($completed.State -ne "Completed") { + $success = $false Write-Warning $message $log_content | Write-Warning } else { - Write-Host $message + # Scan the log. If we find PASSED and no occurrence of FAILED + # then it is a success + [bool]$pass_found = $false + ForEach($l in $log_content) { + + if(($l -match "^\[\s+FAILED") -or + ($l -match "Assertion failed:")) { + $pass_found = $false + break + } + + if(($l -match "^\[\s+PASSED") -or + ($l -match " : PASSED$") -or + ($l -match "^PASS$") -or # Special c_test case + ($l -match "Passed all tests!") ) { + $pass_found = $true + } + } + + if(!$pass_found) { + $success = $false; + Write-Warning $message + $log_content | Write-Warning + if($AddForRerun) { + $Rerun.Add($completed.Name, $log) + } + } else { + Write-Host $message + } } + + # Remove cached job info from the system + # Should be no output + Receive-Job -Job $completed | Out-Null } +} + +RunJobs -TestToLog $TestsToRun -ConcurrencyVal $Concurrency -AddForRerun $EnableRerun - # Remove cached job info from the system - # Should be no output - Receive-Job -Job $completed | Out-Null +if($Rerun.Count -gt 0) { + Write-Host "Rerunning " ($Rerun.Count) " tests sequentially" + $success = $true + $count = 0 + RunJobs -TestToLog $Rerun -ConcurrencyVal 1 -AddForRerun $false } Get-Date + if(!$success) { # This does not succeed killing off jobs 
quick # So we simply exit @@ -249,4 +327,4 @@ if(!$success) { exit 12345 } - \ No newline at end of file + diff --git a/external/rocksdb/util/auto_roll_logger.cc b/external/rocksdb/db/auto_roll_logger.cc similarity index 73% rename from external/rocksdb/util/auto_roll_logger.cc rename to external/rocksdb/db/auto_roll_logger.cc index 4ea0356796..0fb2a1d2a4 100644 --- a/external/rocksdb/util/auto_roll_logger.cc +++ b/external/rocksdb/db/auto_roll_logger.cc @@ -1,18 +1,18 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -#include "util/auto_roll_logger.h" +#include "db/auto_roll_logger.h" #include "util/mutexlock.h" -using namespace std; - namespace rocksdb { // -- AutoRollLogger Status AutoRollLogger::ResetLogger() { + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); status_ = env_->NewLogger(log_fname_, &logger_); + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); if (!status_.ok()) { return status_; @@ -32,12 +32,21 @@ Status AutoRollLogger::ResetLogger() { } void AutoRollLogger::RollLogFile() { - std::string old_fname = OldInfoLogFileName( - dbname_, env_->NowMicros(), db_absolute_path_, db_log_dir_); + // This function is called when log is rotating. Two rotations + // can happen quickly (NowMicro returns same value). To not overwrite + // previous log file we increment by one micro second and try again. 
+ uint64_t now = env_->NowMicros(); + std::string old_fname; + do { + old_fname = OldInfoLogFileName( + dbname_, now, db_absolute_path_, db_log_dir_); + now++; + } while (env_->FileExists(old_fname).ok()); env_->RenameFile(log_fname_, old_fname); } -string AutoRollLogger::ValistToString(const char* format, va_list args) const { +std::string AutoRollLogger::ValistToString(const char* format, + va_list args) const { // Any log messages longer than 1024 will get truncated. // The user is responsible for chopping longer messages into multi line log static const int MAXBUFFERSIZE = 1024; @@ -102,7 +111,7 @@ void AutoRollLogger::LogHeader(const char* format, va_list args) { // strings va_list tmp; va_copy(tmp, args); - string data = ValistToString(format, tmp); + std::string data = ValistToString(format, tmp); va_end(tmp); MutexLock l(&mutex_); @@ -122,22 +131,25 @@ bool AutoRollLogger::LogExpired() { return cached_now >= ctime_ + kLogFileTimeToRoll; } -Status CreateLoggerFromOptions( - const std::string& dbname, - const std::string& db_log_dir, - Env* env, - const DBOptions& options, - std::shared_ptr* logger) { +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr* logger) { + if (options.info_log) { + *logger = options.info_log; + return Status::OK(); + } + + Env* env = options.env; std::string db_absolute_path; env->GetAbsolutePath(dbname, &db_absolute_path); - std::string fname = InfoLogFileName(dbname, db_absolute_path, db_log_dir); + std::string fname = + InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); env->CreateDirIfMissing(dbname); // In case it does not exist // Currently we only support roll by time-to-roll and log size if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( - env, dbname, db_log_dir, - options.max_log_file_size, + env, dbname, options.db_log_dir, options.max_log_file_size, options.log_file_time_to_roll, 
options.info_log_level); Status s = result->GetStatus(); if (!s.ok()) { @@ -148,8 +160,9 @@ Status CreateLoggerFromOptions( return s; } else { // Open a log file in the same directory as the db - env->RenameFile(fname, OldInfoLogFileName(dbname, env->NowMicros(), - db_absolute_path, db_log_dir)); + env->RenameFile( + fname, OldInfoLogFileName(dbname, env->NowMicros(), db_absolute_path, + options.db_log_dir)); auto s = env->NewLogger(fname, logger); if (logger->get() != nullptr) { (*logger)->SetInfoLogLevel(options.info_log_level); diff --git a/external/rocksdb/util/auto_roll_logger.h b/external/rocksdb/db/auto_roll_logger.h similarity index 77% rename from external/rocksdb/util/auto_roll_logger.h rename to external/rocksdb/db/auto_roll_logger.h index 5b6dff6ae5..a43a98a8f1 100644 --- a/external/rocksdb/util/auto_roll_logger.h +++ b/external/rocksdb/db/auto_roll_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,10 +8,13 @@ #pragma once #include +#include #include "db/filename.h" #include "port/port.h" #include "port/util_logger.h" +#include "util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { @@ -52,11 +55,26 @@ class AutoRollLogger : public Logger { return status_; } - size_t GetLogFileSize() const override { return logger_->GetLogFileSize(); } + size_t GetLogFileSize() const override { + std::shared_ptr logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. 
+ logger = logger_; + } + return logger->GetLogFileSize(); + } void Flush() override { - if (logger_) { - logger_->Flush(); + std::shared_ptr logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + TEST_SYNC_POINT("AutoRollLogger::Flush:PinnedLogger"); + if (logger) { + logger->Flush(); } } @@ -100,15 +118,12 @@ class AutoRollLogger : public Logger { uint64_t ctime_; uint64_t cached_now_access_count; uint64_t call_NowMicros_every_N_records_; - port::Mutex mutex_; + mutable port::Mutex mutex_; }; // Facade to craete logger automatically -Status CreateLoggerFromOptions( - const std::string& dbname, - const std::string& db_log_dir, - Env* env, - const DBOptions& options, - std::shared_ptr* logger); +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr* logger); } // namespace rocksdb diff --git a/external/rocksdb/util/auto_roll_logger_test.cc b/external/rocksdb/db/auto_roll_logger_test.cc similarity index 75% rename from external/rocksdb/util/auto_roll_logger_test.cc rename to external/rocksdb/db/auto_roll_logger_test.cc index 138eb6eb47..6a0c954613 100644 --- a/external/rocksdb/util/auto_roll_logger_test.cc +++ b/external/rocksdb/db/auto_roll_logger_test.cc @@ -1,23 +1,24 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// #include +#include #include #include #include #include #include #include +#include "db/auto_roll_logger.h" +#include "port/port.h" +#include "util/sync_point.h" #include "util/testharness.h" -#include "util/auto_roll_logger.h" #include "rocksdb/db.h" #include #include -using namespace std; - namespace rocksdb { class AutoRollLoggerTest : public testing::Test { @@ -37,23 +38,22 @@ class AutoRollLoggerTest : public testing::Test { Env::Default()->CreateDir(kTestDir); } - void RollLogFileBySizeTest(AutoRollLogger* logger, - size_t log_max_size, - const string& log_message); - uint64_t RollLogFileByTimeTest(AutoRollLogger* logger, - size_t time, - const string& log_message); + void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, + const std::string& log_message); + uint64_t RollLogFileByTimeTest(AutoRollLogger* logger, size_t time, + const std::string& log_message); - static const string kSampleMessage; - static const string kTestDir; - static const string kLogFile; + static const std::string kSampleMessage; + static const std::string kTestDir; + static const std::string kLogFile; static Env* env; }; -const string AutoRollLoggerTest::kSampleMessage( +const std::string AutoRollLoggerTest::kSampleMessage( "this is the message to be written to the log file!!"); -const string AutoRollLoggerTest::kTestDir(test::TmpDir() + "/db_log_test"); -const string AutoRollLoggerTest::kLogFile(test::TmpDir() + "/db_log_test/LOG"); +const std::string AutoRollLoggerTest::kTestDir(test::TmpDir() + "/db_log_test"); +const std::string AutoRollLoggerTest::kLogFile(test::TmpDir() + + "/db_log_test/LOG"); Env* AutoRollLoggerTest::env = Env::Default(); // In this test we only want to Log some simple log message with @@ -83,7 +83,7 @@ void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) { void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, - const string& log_message) { + const std::string& log_message) { 
logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); // measure the size of each message, which is supposed // to be equal or greater than log_message.size() @@ -108,7 +108,7 @@ void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, } uint64_t AutoRollLoggerTest::RollLogFileByTimeTest( - AutoRollLogger* logger, size_t time, const string& log_message) { + AutoRollLogger* logger, size_t time, const std::string& log_message) { uint64_t expected_create_time; uint64_t actual_create_time; uint64_t total_log_size; @@ -221,13 +221,13 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { shared_ptr logger; // Normal logger - ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); ASSERT_TRUE(dynamic_cast(logger.get())); // Only roll by size InitTestDb(); options.max_log_file_size = 1024; - ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); AutoRollLogger* auto_roll_logger = dynamic_cast(logger.get()); ASSERT_TRUE(auto_roll_logger); @@ -239,7 +239,7 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { InitTestDb(); options.max_log_file_size = 0; options.log_file_time_to_roll = 2; - ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); auto_roll_logger = dynamic_cast(logger.get()); RollLogFileByTimeTest( @@ -250,7 +250,7 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { InitTestDb(); options.max_log_file_size = 1024 * 5; options.log_file_time_to_roll = 2; - ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); auto_roll_logger = dynamic_cast(logger.get()); RollLogFileBySizeTest( @@ -260,7 +260,49 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { auto_roll_logger, options.log_file_time_to_roll, 
kSampleMessage + ":CreateLoggerFromOptions - both"); } -#endif + +TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { + DBOptions options; + shared_ptr logger; + + InitTestDb(); + options.max_log_file_size = 1024 * 5; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast(logger.get()); + ASSERT_TRUE(auto_roll_logger); + std::thread flush_thread; + + // Notes: + // (1) Need to pin the old logger before beginning the roll, as rolling grabs + // the mutex, which would prevent us from accessing the old logger. This + // also marks flush_thread with AutoRollLogger::Flush:PinnedLogger. + // (2) Need to reset logger during PosixLogger::Flush() to exercise a race + // condition case, which is executing the flush with the pinned (old) + // logger after auto-roll logger has cut over to a new logger. + // (3) PosixLogger::Flush() happens in both threads but its SyncPoints only + // are enabled in flush_thread (the one pinning the old logger). 
+ rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( + {{"AutoRollLogger::Flush:PinnedLogger", + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"}, + {"PosixLogger::Flush:Begin1", + "AutoRollLogger::ResetLogger:BeforeNewLogger"}, + {"AutoRollLogger::ResetLogger:AfterNewLogger", + "PosixLogger::Flush:Begin2"}}, + {{"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin1"}, + {"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin2"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + flush_thread = std::thread([&]() { auto_roll_logger->Flush(); }); + TEST_SYNC_POINT( + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); + RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":LogFlushWhileRolling"); + flush_thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif // OS_WIN TEST_F(AutoRollLoggerTest, InfoLogLevel) { InitTestDb(); @@ -305,13 +347,13 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) { // Test the logger Header function for roll over logs // We expect the new logs creates as roll over to carry the headers specified -static std::vector GetOldFileNames(const string& path) { - std::vector ret; +static std::vector GetOldFileNames(const std::string& path) { + std::vector ret; - const string dirname = path.substr(/*start=*/ 0, path.find_last_of("/")); - const string fname = path.substr(path.find_last_of("/") + 1); + const std::string dirname = path.substr(/*start=*/0, path.find_last_of("/")); + const std::string fname = path.substr(path.find_last_of("/") + 1); - std::vector children; + std::vector children; Env::Default()->GetChildren(dirname, &children); // We know that the old log files are named [path] @@ -326,12 +368,13 @@ static std::vector GetOldFileNames(const string& path) { } // Return the number of lines where a given pattern was found in the file -static size_t GetLinesCount(const string& fname, const string& pattern) 
{ - stringstream ssbuf; - string line; +static size_t GetLinesCount(const std::string& fname, + const std::string& pattern) { + std::stringstream ssbuf; + std::string line; size_t count = 0; - ifstream inFile(fname.c_str()); + std::ifstream inFile(fname.c_str()); ssbuf << inFile.rdbuf(); while (getline(ssbuf, line)) { @@ -370,7 +413,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { } } - const string newfname = logger.TEST_log_fname(); + const std::string newfname = logger.TEST_log_fname(); // Log enough data to cause a roll over int i = 0; @@ -402,7 +445,16 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { TEST_F(AutoRollLoggerTest, LogFileExistence) { rocksdb::DB* db; rocksdb::Options options; - string deleteCmd = "rm -rf " + kTestDir; +#ifdef OS_WIN + // Replace all slashes in the path so windows CompSpec does not + // become confused + std::string testDir(kTestDir); + std::replace_if(testDir.begin(), testDir.end(), + [](char ch) { return ch == '/'; }, '\\'); + std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir; +#else + std::string deleteCmd = "rm -rf " + kTestDir; +#endif ASSERT_EQ(system(deleteCmd.c_str()), 0); options.max_log_file_size = 100 * 1024 * 1024; options.create_if_missing = true; diff --git a/external/rocksdb/db/builder.cc b/external/rocksdb/db/builder.cc index 3d07a0f30e..ab255e493d 100644 --- a/external/rocksdb/db/builder.cc +++ b/external/rocksdb/db/builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -15,6 +15,7 @@ #include "db/compaction_iterator.h" #include "db/dbformat.h" +#include "db/event_helpers.h" #include "db/filename.h" #include "db/internal_stats.h" #include "db/merge_helper.h" @@ -26,6 +27,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based_table_builder.h" +#include "table/internal_iterator.h" #include "util/file_reader_writer.h" #include "util/iostats_context_imp.h" #include "util/stop_watch.h" @@ -40,25 +42,39 @@ TableBuilder* NewTableBuilder( const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, + uint32_t column_family_id, const std::string& column_family_name, WritableFileWriter* file, const CompressionType compression_type, - const CompressionOptions& compression_opts, const bool skip_filters) { + const CompressionOptions& compression_opts, + const std::string* compression_dict, const bool skip_filters) { + assert((column_family_id == + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == + column_family_name.empty()); return ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, internal_comparator, int_tbl_prop_collector_factories, compression_type, - compression_opts, skip_filters), - file); + compression_opts, compression_dict, skip_filters, + column_family_name), + column_family_id, file); } Status BuildTable( const std::string& dbname, Env* env, const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, - FileMetaData* meta, const InternalKeyComparator& internal_comparator, + const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, + TableCache* table_cache, InternalIterator* iter, FileMetaData* meta, + const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - std::vector snapshots, const CompressionType compression, + uint32_t column_family_id, const std::string& column_family_name, + std::vector 
snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const CompressionType compression, const CompressionOptions& compression_opts, bool paranoid_file_checks, - InternalStats* internal_stats, const Env::IOPriority io_priority, - TableProperties* table_properties) { + InternalStats* internal_stats, TableFileCreationReason reason, + EventLogger* event_logger, int job_id, const Env::IOPriority io_priority, + TableProperties* table_properties, int level) { + assert((column_family_id == + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == + column_family_name.empty()); // Reports the IOStats for flush for every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; Status s; @@ -67,13 +83,22 @@ Status BuildTable( std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); +#ifndef ROCKSDB_LITE + EventHelpers::NotifyTableFileCreationStarted( + ioptions.listeners, dbname, column_family_name, fname, job_id, reason); +#endif // !ROCKSDB_LITE + TableProperties tp; + if (iter->Valid()) { TableBuilder* builder; unique_ptr file_writer; { unique_ptr file; - s = env->NewWritableFile(fname, &file, env_options); + s = NewWritableFile(env, fname, &file, env_options); if (!s.ok()) { + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, column_family_name, fname, + job_id, meta->fd, tp, reason, s); return s; } file->SetIOPriority(io_priority); @@ -82,17 +107,19 @@ Status BuildTable( builder = NewTableBuilder( ioptions, internal_comparator, int_tbl_prop_collector_factories, - file_writer.get(), compression, compression_opts); + column_family_id, column_family_name, file_writer.get(), compression, + compression_opts); } MergeHelper merge(env, internal_comparator.user_comparator(), ioptions.merge_operator, nullptr, ioptions.info_log, - ioptions.min_partial_merge_operands, + mutable_cf_options.min_partial_merge_operands, true /* internal key corruption is not ok */, 
snapshots.empty() ? 0 : snapshots.back()); CompactionIterator c_iter(iter, internal_comparator.user_comparator(), - &merge, kMaxSequenceNumber, &snapshots, env, + &merge, kMaxSequenceNumber, &snapshots, + earliest_write_conflict_snapshot, env, true /* internal key corruption is not ok */); c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { @@ -119,11 +146,13 @@ Status BuildTable( } if (s.ok() && !empty) { - meta->fd.file_size = builder->FileSize(); + uint64_t file_size = builder->FileSize(); + meta->fd.file_size = file_size; meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); + tp = builder->GetTableProperties(); if (table_properties) { - *table_properties = builder->GetTableProperties(); + *table_properties = tp; } } delete builder; @@ -139,11 +168,12 @@ Status BuildTable( if (s.ok() && !empty) { // Verify that the table is usable - std::unique_ptr it(table_cache->NewIterator( + std::unique_ptr it(table_cache->NewIterator( ReadOptions(), env_options, internal_comparator, meta->fd, nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), - false)); + false /* for_compaction */, nullptr /* arena */, + false /* skip_filter */, level)); s = it->status(); if (s.ok() && paranoid_file_checks) { for (it->SeekToFirst(); it->Valid(); it->Next()) { @@ -161,6 +191,12 @@ Status BuildTable( if (!s.ok() || meta->fd.GetFileSize() == 0) { env->DeleteFile(fname); } + + // Output to event logger and fire events. + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, column_family_name, fname, + job_id, meta->fd, tp, reason, s); + return s; } diff --git a/external/rocksdb/db/builder.h b/external/rocksdb/db/builder.h index 09d81bfe4f..62aa717c66 100644 --- a/external/rocksdb/db/builder.h +++ b/external/rocksdb/db/builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,11 +12,14 @@ #include "db/table_properties_collector.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" -#include "rocksdb/status.h" -#include "rocksdb/types.h" -#include "rocksdb/options.h" #include "rocksdb/immutable_options.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" #include "rocksdb/table_properties.h" +#include "rocksdb/types.h" +#include "util/event_logger.h" +#include "util/mutable_cf_options.h" namespace rocksdb { @@ -31,14 +34,22 @@ class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; +class InternalIterator; +// @param column_family_name Name of the column family that is also identified +// by column_family_id, or empty string if unknown. It must outlive the +// TableBuilder returned by this function. +// @param compression_dict Data for presetting the compression library's +// dictionary, or nullptr. TableBuilder* NewTableBuilder( const ImmutableCFOptions& options, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, + uint32_t column_family_id, const std::string& column_family_name, WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, + const std::string* compression_dict = nullptr, const bool skip_filters = false); // Build a Table file from the contents of *iter. The generated file @@ -46,16 +57,24 @@ TableBuilder* NewTableBuilder( // *meta will be filled with metadata about the generated table. // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. 
+// +// @param column_family_name Name of the column family that is also identified +// by column_family_id, or empty string if unknown. extern Status BuildTable( const std::string& dbname, Env* env, const ImmutableCFOptions& options, - const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, - FileMetaData* meta, const InternalKeyComparator& internal_comparator, + const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, + TableCache* table_cache, InternalIterator* iter, FileMetaData* meta, + const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - std::vector snapshots, const CompressionType compression, + uint32_t column_family_id, const std::string& column_family_name, + std::vector snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const CompressionType compression, const CompressionOptions& compression_opts, bool paranoid_file_checks, - InternalStats* internal_stats, + InternalStats* internal_stats, TableFileCreationReason reason, + EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, - TableProperties* table_properties = nullptr); + TableProperties* table_properties = nullptr, int level = -1); } // namespace rocksdb diff --git a/external/rocksdb/db/c.cc b/external/rocksdb/db/c.cc index 8cd08265e9..05802aa21d 100644 --- a/external/rocksdb/db/c.cc +++ b/external/rocksdb/db/c.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -269,33 +269,31 @@ struct rocksdb_mergeoperator_t : public MergeOperator { virtual const char* Name() const override { return (*name_)(state_); } - virtual bool FullMerge(const Slice& key, const Slice* existing_value, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { - size_t n = operand_list.size(); + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + size_t n = merge_in.operand_list.size(); std::vector operand_pointers(n); std::vector operand_sizes(n); for (size_t i = 0; i < n; i++) { - Slice operand(operand_list[i]); + Slice operand(merge_in.operand_list[i]); operand_pointers[i] = operand.data(); operand_sizes[i] = operand.size(); } const char* existing_value_data = nullptr; size_t existing_value_len = 0; - if (existing_value != nullptr) { - existing_value_data = existing_value->data(); - existing_value_len = existing_value->size(); + if (merge_in.existing_value != nullptr) { + existing_value_data = merge_in.existing_value->data(); + existing_value_len = merge_in.existing_value->size(); } unsigned char success; size_t new_value_len; char* tmp_new_value = (*full_merge_)( - state_, key.data(), key.size(), existing_value_data, existing_value_len, - &operand_pointers[0], &operand_sizes[0], static_cast(n), &success, - &new_value_len); - new_value->assign(tmp_new_value, new_value_len); + state_, merge_in.key.data(), merge_in.key.size(), existing_value_data, + existing_value_len, &operand_pointers[0], &operand_sizes[0], + static_cast(n), &success, &new_value_len); + merge_out->new_value.assign(tmp_new_value, new_value_len); if (delete_value_ != nullptr) { (*delete_value_)(state_, tmp_new_value, new_value_len); @@ -446,6 +444,12 @@ void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, SaveError(errptr, be->rep->CreateNewBackup(db->rep)); } +void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be, + uint32_t 
num_backups_to_keep, + char** errptr) { + SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep)); +} + rocksdb_restore_options_t* rocksdb_restore_options_create() { return new rocksdb_restore_options_t; } @@ -825,6 +829,31 @@ rocksdb_iterator_t* rocksdb_create_iterator_cf( return result; } +void rocksdb_create_iterators( + rocksdb_t *db, + rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, + size_t size, + char** errptr) { + std::vector column_families_vec; + for (size_t i = 0; i < size; i++) { + column_families_vec.push_back(column_families[i]->rep); + } + + std::vector res; + Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res); + assert(res.size() == size); + if (SaveError(errptr, status)) { + return; + } + + for (size_t i = 0; i < size; i++) { + iterators[i] = new rocksdb_iterator_t; + iterators[i]->rep = res[i]; + } +} + const rocksdb_snapshot_t* rocksdb_create_snapshot( rocksdb_t* db) { rocksdb_snapshot_t* result = new rocksdb_snapshot_t; @@ -1288,6 +1317,16 @@ void rocksdb_block_based_options_set_cache_index_and_filter_blocks( options->rep.cache_index_and_filter_blocks = v; } +void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.pin_l0_filter_and_index_blocks_in_cache = v; +} + +void rocksdb_block_based_options_set_skip_table_builder_flush( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.skip_table_builder_flush = v; +} + void rocksdb_options_set_block_based_table_factory( rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options) { @@ -1383,6 +1422,11 @@ void rocksdb_options_set_compaction_filter_factory( std::shared_ptr(factory); } +void rocksdb_options_compaction_readahead_size( + rocksdb_options_t* opt, size_t s) { + opt->rep.compaction_readahead_size = s; +} + void rocksdb_options_set_comparator( 
rocksdb_options_t* opt, rocksdb_comparator_t* cmp) { @@ -1526,11 +1570,13 @@ void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, } } -void rocksdb_options_set_compression_options( - rocksdb_options_t* opt, int w_bits, int level, int strategy) { +void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, + int level, int strategy, + int max_dict_bytes) { opt->rep.compression_opts.window_bits = w_bits; opt->rep.compression_opts.level = level; opt->rep.compression_opts.strategy = strategy; + opt->rep.compression_opts.max_dict_bytes = max_dict_bytes; } void rocksdb_options_set_prefix_extractor( @@ -1644,11 +1690,6 @@ void rocksdb_options_set_verify_checksums_in_compaction( opt->rep.verify_checksums_in_compaction = v; } -void rocksdb_options_set_filter_deletes( - rocksdb_options_t* opt, unsigned char v) { - opt->rep.filter_deletes = v; -} - void rocksdb_options_set_max_sequential_skip_in_iterations( rocksdb_options_t* opt, uint64_t v) { opt->rep.max_sequential_skip_in_iterations = v; @@ -1687,6 +1728,11 @@ void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) { opt->rep.keep_log_file_num = v; } +void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt, + size_t v) { + opt->rep.recycle_log_file_num = v; +} + void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) { opt->rep.soft_rate_limit = v; } @@ -1746,14 +1792,14 @@ void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) { opt->rep.memtable_factory.reset(factory); } -void rocksdb_options_set_memtable_prefix_bloom_bits( - rocksdb_options_t* opt, uint32_t v) { - opt->rep.memtable_prefix_bloom_bits = v; +void rocksdb_options_set_memtable_prefix_bloom_size_ratio( + rocksdb_options_t* opt, double v) { + opt->rep.memtable_prefix_bloom_size_ratio = v; } -void rocksdb_options_set_memtable_prefix_bloom_probes( - rocksdb_options_t* opt, uint32_t v) { - opt->rep.memtable_prefix_bloom_probes = v; +void 
rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt, + size_t v) { + opt->rep.memtable_huge_page_size = v; } void rocksdb_options_set_hash_skip_list_rep( @@ -1817,6 +1863,11 @@ void rocksdb_options_set_inplace_update_num_locks( opt->rep.inplace_update_num_locks = v; } +void rocksdb_options_set_report_bg_io_stats( + rocksdb_options_t* opt, int v) { + opt->rep.report_bg_io_stats = v; +} + void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { opt->rep.compaction_style = static_cast(style); } @@ -1955,7 +2006,7 @@ void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { delete filter; } -rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(int bits_per_key, bool original_format) { // Make a rocksdb_filterpolicy_t, but override all of its methods so // they delegate to a NewBloomFilterPolicy() instead of user // supplied C functions. @@ -1973,13 +2024,21 @@ rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { static void DoNothing(void*) { } }; Wrapper* wrapper = new Wrapper; - wrapper->rep_ = NewBloomFilterPolicy(bits_per_key); + wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format); wrapper->state_ = nullptr; wrapper->delete_filter_ = nullptr; wrapper->destructor_ = &Wrapper::DoNothing; return wrapper; } +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(int bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true); +} + rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( void* state, void (*destructor)(void*), char* (*full_merge)(void*, const char* key, size_t key_length, @@ -2056,6 +2115,11 @@ void rocksdb_readoptions_set_tailing( opt->rep.tailing = v; } +void 
rocksdb_readoptions_set_readahead_size( + rocksdb_readoptions_t* opt, size_t v) { + opt->rep.readahead_size = v; +} + rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } @@ -2097,6 +2161,10 @@ void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } +void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { + cache->rep->SetCapacity(capacity); +} + rocksdb_env_t* rocksdb_create_default_env() { rocksdb_env_t* result = new rocksdb_env_t; result->rep = Env::Default(); @@ -2104,6 +2172,13 @@ rocksdb_env_t* rocksdb_create_default_env() { return result; } +rocksdb_env_t* rocksdb_create_mem_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = rocksdb::NewMemEnv(Env::Default()); + result->is_default = false; + return result; +} + void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n); } @@ -2315,6 +2390,31 @@ void rocksdb_get_options_from_string(const rocksdb_options_t* base_options, &new_options->rep)); } +void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key, + size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, db->rep->DefaultColumnFamily(), + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + +void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, column_family->rep, + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? 
(b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + void rocksdb_free(void* ptr) { free(ptr); } } // end extern "C" diff --git a/external/rocksdb/db/c_test.c b/external/rocksdb/db/c_test.c index 554362711b..7236e01db0 100644 --- a/external/rocksdb/db/c_test.c +++ b/external/rocksdb/db/c_test.c @@ -322,7 +322,7 @@ int main(int argc, char** argv) { rocksdb_options_set_block_based_table_factory(options, table_options); rocksdb_options_set_compression(options, rocksdb_no_compression); - rocksdb_options_set_compression_options(options, -14, -1, 0); + rocksdb_options_set_compression_options(options, -14, -1, 0, 0); int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, rocksdb_no_compression, rocksdb_no_compression}; rocksdb_options_set_compression_per_level(options, compression_levels, 4); @@ -365,6 +365,24 @@ int main(int argc, char** argv) { rocksdb_backup_engine_create_new_backup(be, db, &err); CheckNoError(err); + // need a change to trigger a new backup + rocksdb_delete(db, woptions, "does-not-exist", 14, &err); + CheckNoError(err); + + rocksdb_backup_engine_create_new_backup(be, db, &err); + CheckNoError(err); + + const rocksdb_backup_engine_info_t* bei = rocksdb_backup_engine_get_backup_info(be); + CheckCondition(rocksdb_backup_engine_info_count(bei) > 1); + rocksdb_backup_engine_info_destroy(bei); + + rocksdb_backup_engine_purge_old_backups(be, 1, &err); + CheckNoError(err); + + bei = rocksdb_backup_engine_get_backup_info(be); + CheckCondition(rocksdb_backup_engine_info_count(bei) == 1); + rocksdb_backup_engine_info_destroy(bei); + rocksdb_delete(db, woptions, "foo", 3, &err); CheckNoError(err); @@ -768,6 +786,30 @@ int main(int argc, char** argv) { CheckNoError(err); rocksdb_iter_destroy(iter); + rocksdb_column_family_handle_t* iters_cf_handles[2] = { handles[0], handles[1] }; + rocksdb_iterator_t* iters_handles[2]; + rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2, &err); + CheckNoError(err); + + iter = 
iters_handles[0]; + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_destroy(iter); + + iter = iters_handles[1]; + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + + for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) { + i++; + } + CheckCondition(i == 1); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + rocksdb_drop_column_family(db, handles[1], &err); CheckNoError(err); for (i = 0; i < 2; i++) { diff --git a/external/rocksdb/db/column_family.cc b/external/rocksdb/db/column_family.cc index 88bf0339bc..dcc94f9692 100644 --- a/external/rocksdb/db/column_family.cc +++ b/external/rocksdb/db/column_family.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -26,10 +26,9 @@ #include "db/table_properties_collector.h" #include "db/version_set.h" #include "db/write_controller.h" -#include "db/writebuffer.h" +#include "memtable/hash_skiplist_rep.h" #include "util/autovector.h" #include "util/compression.h" -#include "util/hash_skiplist_rep.h" #include "util/options_helper.h" #include "util/thread_status_util.h" #include "util/xfunc.h" @@ -68,6 +67,20 @@ const std::string& ColumnFamilyHandleImpl::GetName() const { return cfd()->GetName(); } +Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) { +#ifndef ROCKSDB_LITE + // accessing mutable cf-options requires db mutex. 
+ InstrumentedMutexLock l(mutex_); + *desc = ColumnFamilyDescriptor( + cfd()->GetName(), + BuildColumnFamilyOptions(*cfd()->options(), + *cfd()->GetLatestMutableCFOptions())); + return Status::OK(); +#else + return Status::NotSupported(); +#endif // !ROCKSDB_LITE +} + const Comparator* ColumnFamilyHandleImpl::user_comparator() const { return cfd()->user_comparator(); } @@ -110,18 +123,28 @@ Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) { return Status::OK(); } +Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) { + if (cf_options.inplace_update_support) { + return Status::InvalidArgument( + "In-place memtable updates (inplace_update_support) is not compatible " + "with concurrent writes (allow_concurrent_memtable_write)"); + } + if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) { + return Status::InvalidArgument( + "Memtable doesn't concurrent writes (allow_concurrent_memtable_write)"); + } + return Status::OK(); +} + ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options, const InternalKeyComparator* icmp, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; result.comparator = icmp; -#ifdef OS_MACOSX - // TODO(icanadi) make write_buffer_size uint64_t instead of size_t - ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30); -#else - ClipToRange(&result.write_buffer_size, - ((size_t)64) << 10, ((size_t)64) << 30); -#endif + size_t clamp_max = std::conditional< + sizeof(size_t) == 4, std::integral_constant, + std::integral_constant>::type::value; + ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, clamp_max); // if user sets arena_block_size, we trust user to use this value. 
Otherwise, // calculate a proper value from writer_buffer_size; if (result.arena_block_size <= 0) { @@ -148,6 +171,12 @@ ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options, if (result.max_write_buffer_number_to_maintain < 0) { result.max_write_buffer_number_to_maintain = result.max_write_buffer_number; } + // bloom filter size shouldn't exceed 1/4 of memtable size. + if (result.memtable_prefix_bloom_size_ratio > 0.25) { + result.memtable_prefix_bloom_size_ratio = 0.25; + } else if (result.memtable_prefix_bloom_size_ratio < 0) { + result.memtable_prefix_bloom_size_ratio = 0; + } XFUNC_TEST("memtablelist_history", "transaction_xftest_SanitizeOptions", xf_transaction_set_memtable_history1, xf_transaction_set_memtable_history, @@ -175,6 +204,12 @@ ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options, result.level0_stop_writes_trigger = std::numeric_limits::max(); } + if (result.level0_file_num_compaction_trigger == 0) { + Warn(db_options.info_log.get(), + "level0_file_num_compaction_trigger cannot be 0"); + result.level0_file_num_compaction_trigger = 1; + } + if (result.level0_stop_writes_trigger < result.level0_slowdown_writes_trigger || result.level0_slowdown_writes_trigger < @@ -205,6 +240,17 @@ ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options, result.level0_slowdown_writes_trigger, result.level0_file_num_compaction_trigger); } + + if (result.soft_pending_compaction_bytes_limit == 0) { + result.soft_pending_compaction_bytes_limit = + result.hard_pending_compaction_bytes_limit; + } else if (result.hard_pending_compaction_bytes_limit > 0 && + result.soft_pending_compaction_bytes_limit > + result.hard_pending_compaction_bytes_limit) { + result.soft_pending_compaction_bytes_limit = + result.hard_pending_compaction_bytes_limit; + } + if (result.level_compaction_dynamic_level_bytes) { if (result.compaction_style != kCompactionStyleLevel || db_options.db_paths.size() > 1U) { @@ -283,7 +329,7 @@ void SuperVersionUnrefHandle(void* ptr) 
{ ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, - Cache* _table_cache, WriteBuffer* write_buffer, + Cache* _table_cache, WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& cf_options, const DBOptions* db_options, const EnvOptions& env_options, ColumnFamilySet* column_family_set) : id_(id), @@ -297,7 +343,7 @@ ColumnFamilyData::ColumnFamilyData( SanitizeOptions(*db_options, &internal_comparator_, cf_options)), ioptions_(options_), mutable_cf_options_(options_, ioptions_), - write_buffer_(write_buffer), + write_buffer_manager_(write_buffer_manager), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge, options_.max_write_buffer_number_to_maintain), @@ -309,7 +355,8 @@ ColumnFamilyData::ColumnFamilyData( log_number_(0), column_family_set_(column_family_set), pending_flush_(false), - pending_compaction_(false) { + pending_compaction_(false), + prev_compaction_needed_bytes_(0) { Ref(); // Convert user defined table properties collector factories to internal ones. @@ -427,13 +474,81 @@ void ColumnFamilyData::SetDropped() { column_family_set_->RemoveColumnFamily(this); } +const double kSlowdownRatio = 1.2; + +namespace { +std::unique_ptr SetupDelay( + uint64_t max_write_rate, WriteController* write_controller, + uint64_t compaction_needed_bytes, uint64_t prev_compaction_neeed_bytes, + bool auto_comapctions_disabled) { + const uint64_t kMinWriteRate = 1024u; // Minimum write rate 1KB/s. + + uint64_t write_rate = write_controller->delayed_write_rate(); + + if (auto_comapctions_disabled) { + // When auto compaction is disabled, always use the value user gave. + write_rate = max_write_rate; + } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) { + // If user gives rate less than kMinWriteRate, don't adjust it. + // + // If already delayed, need to adjust based on previous compaction debt. 
+ // When there are two or more column families require delay, we always + // increase or reduce write rate based on information for one single + // column family. It is likely to be OK but we can improve if there is a + // problem. + // Ignore compaction_needed_bytes = 0 case because compaction_needed_bytes + // is only available in level-based compaction + // + // If the compaction debt stays the same as previously, we also further slow + // down. It usually means a mem table is full. It's mainly for the case + // where both of flush and compaction are much slower than the speed we + // insert to mem tables, so we need to actively slow down before we get + // feedback signal from compaction and flushes to avoid the full stop + // because of hitting the max write buffer number. + if (prev_compaction_neeed_bytes > 0 && + prev_compaction_neeed_bytes <= compaction_needed_bytes) { + write_rate = static_cast(static_cast(write_rate) / + kSlowdownRatio); + if (write_rate < kMinWriteRate) { + write_rate = kMinWriteRate; + } + } else if (prev_compaction_neeed_bytes > compaction_needed_bytes) { + // We are speeding up by ratio of kSlowdownRatio when we have paid + // compaction debt. But we'll never speed up to faster than the write rate + // given by users. + write_rate = static_cast(static_cast(write_rate) * + kSlowdownRatio); + if (write_rate > max_write_rate) { + write_rate = max_write_rate; + } + } + } + return write_controller->GetDelayToken(write_rate); +} + +int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, + int level0_slowdown_writes_trigger) { + // SanitizeOptions() ensures it. + assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger); + + // 1/4 of the way between L0 compaction trigger threshold and slowdown + // condition. + // Or twice as compaction trigger, if it is smaller. 
+ return std::min(level0_file_num_compaction_trigger * 2, + level0_file_num_compaction_trigger + + (level0_slowdown_writes_trigger - + level0_file_num_compaction_trigger) / + 4); +} +} // namespace + void ColumnFamilyData::RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options) { if (current_ != nullptr) { auto* vstorage = current_->storage_info(); - const double score = vstorage->max_compaction_score(); - const int max_level = vstorage->max_compaction_score_level(); auto write_controller = column_family_set_->write_controller_; + uint64_t compaction_needed_bytes = + vstorage->estimated_compaction_needed_bytes(); if (imm()->NumNotFlushed() >= mutable_cf_options.max_write_buffer_number) { write_controller_token_ = write_controller->GetStopToken(); @@ -443,8 +558,9 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number); - } else if (vstorage->l0_delay_trigger_count() >= - mutable_cf_options.level0_stop_writes_trigger) { + } else if (!mutable_cf_options.disable_auto_compactions && + vstorage->l0_delay_trigger_count() >= + mutable_cf_options.level0_stop_writes_trigger) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES_TOTAL, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { @@ -454,38 +570,92 @@ void ColumnFamilyData::RecalculateWriteStallConditions( Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); - } else if (mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && - vstorage->estimated_compaction_needed_bytes() >= + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && + compaction_needed_bytes >= 
mutable_cf_options.hard_pending_compaction_bytes_limit) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats( InternalStats::HARD_PENDING_COMPACTION_BYTES_LIMIT, 1); Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, - "[%s] Stopping writes because estimated pending compaction " - "bytes exceed %" PRIu64, - name_.c_str(), vstorage->estimated_compaction_needed_bytes()); - } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 && + "[%s] Stopping writes because of estimated pending compaction " + "bytes %" PRIu64, + name_.c_str(), compaction_needed_bytes); + } else if (mutable_cf_options.max_write_buffer_number > 3 && + imm()->NumNotFlushed() >= + mutable_cf_options.max_write_buffer_number - 1) { + write_controller_token_ = + SetupDelay(ioptions_.delayed_write_rate, write_controller, + compaction_needed_bytes, prev_compaction_needed_bytes_, + mutable_cf_options.disable_auto_compactions); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_SLOWDOWN, 1); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stalling writes because we have %d immutable memtables " + "(waiting for flush), max_write_buffer_number is set to %d " + "rate %" PRIu64, + name_.c_str(), imm()->NumNotFlushed(), + mutable_cf_options.max_write_buffer_number, + write_controller->delayed_write_rate()); + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.level0_slowdown_writes_trigger >= 0 && vstorage->l0_delay_trigger_count() >= mutable_cf_options.level0_slowdown_writes_trigger) { - write_controller_token_ = write_controller->GetDelayToken(); + write_controller_token_ = + SetupDelay(ioptions_.delayed_write_rate, write_controller, + compaction_needed_bytes, prev_compaction_needed_bytes_, + mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN_TOTAL, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( 
InternalStats::LEVEL0_SLOWDOWN_WITH_COMPACTION, 1); } Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, - "[%s] Stalling writes because we have %d level-0 files", - name_.c_str(), vstorage->l0_delay_trigger_count()); - } else if (mutable_cf_options.soft_rate_limit > 0.0 && - score > mutable_cf_options.soft_rate_limit) { - write_controller_token_ = write_controller->GetDelayToken(); - internal_stats_->RecordLevelNSlowdown(max_level, true); + "[%s] Stalling writes because we have %d level-0 files " + "rate %" PRIu64, + name_.c_str(), vstorage->l0_delay_trigger_count(), + write_controller->delayed_write_rate()); + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.soft_pending_compaction_bytes_limit > 0 && + vstorage->estimated_compaction_needed_bytes() >= + mutable_cf_options.soft_pending_compaction_bytes_limit) { + write_controller_token_ = + SetupDelay(ioptions_.delayed_write_rate, write_controller, + compaction_needed_bytes, prev_compaction_needed_bytes_, + mutable_cf_options.disable_auto_compactions); + internal_stats_->AddCFStats( + InternalStats::SOFT_PENDING_COMPACTION_BYTES_LIMIT, 1); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stalling writes because of estimated pending compaction " + "bytes %" PRIu64 " rate %" PRIu64, + name_.c_str(), vstorage->estimated_compaction_needed_bytes(), + write_controller->delayed_write_rate()); + } else if (vstorage->l0_delay_trigger_count() >= + GetL0ThresholdSpeedupCompaction( + mutable_cf_options.level0_file_num_compaction_trigger, + mutable_cf_options.level0_slowdown_writes_trigger)) { + write_controller_token_ = write_controller->GetCompactionPressureToken(); Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, - "[%s] Stalling writes because we hit soft limit on level %d", - name_.c_str(), max_level); + "[%s] Increasing compaction threads because we have %d level-0 " + "files ", + name_.c_str(), vstorage->l0_delay_trigger_count()); + } else if 
(vstorage->estimated_compaction_needed_bytes() >= + mutable_cf_options.soft_pending_compaction_bytes_limit / 4) { + // Increase compaction threads if bytes needed for compaction exceeds + // 1/4 of threshold for slowing down. + // If soft pending compaction byte limit is not set, always speed up + // compaction. + write_controller_token_ = write_controller->GetCompactionPressureToken(); + if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Increasing compaction threads because of estimated pending " + "compaction " + "bytes %" PRIu64, + name_.c_str(), vstorage->estimated_compaction_needed_bytes()); + } } else { write_controller_token_.reset(); } + prev_compaction_needed_bytes_ = compaction_needed_bytes; } } @@ -509,7 +679,7 @@ MemTable* ColumnFamilyData::ConstructNewMemtable( const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { assert(current_ != nullptr); return new MemTable(internal_comparator_, ioptions_, mutable_cf_options, - write_buffer_, earliest_seq); + write_buffer_manager_, earliest_seq); } void ColumnFamilyData::CreateNewMemtable( @@ -539,13 +709,12 @@ const int ColumnFamilyData::kCompactAllLevels = -1; const int ColumnFamilyData::kCompactToBaseLevel = -2; Compaction* ColumnFamilyData::CompactRange( - const MutableCFOptions& mutable_cf_options, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) { + const MutableCFOptions& mutable_cf_options, int input_level, + int output_level, uint32_t output_path_id, const InternalKey* begin, + const InternalKey* end, InternalKey** compaction_end, bool* conflict) { auto* result = compaction_picker_->CompactRange( GetName(), mutable_cf_options, current_->storage_info(), input_level, - output_level, output_path_id, begin, end, compaction_end); + output_level, output_path_id, begin, end, compaction_end, conflict); if (result != 
nullptr) { result->SetInputVersion(current_); } @@ -689,7 +858,7 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, - WriteBuffer* write_buffer, + WriteBufferManager* write_buffer_manager, WriteController* write_controller) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr, @@ -700,7 +869,7 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, db_options_(db_options), env_options_(env_options), table_cache_(table_cache), - write_buffer_(write_buffer), + write_buffer_manager_(write_buffer_manager), write_controller_(write_controller) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; @@ -763,10 +932,9 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const std::string& name, uint32_t id, Version* dummy_versions, const ColumnFamilyOptions& options) { assert(column_families_.find(name) == column_families_.end()); - ColumnFamilyData* new_cfd = - new ColumnFamilyData(id, name, dummy_versions, table_cache_, - write_buffer_, options, db_options_, - env_options_, this); + ColumnFamilyData* new_cfd = new ColumnFamilyData( + id, name, dummy_versions, table_cache_, write_buffer_manager_, options, + db_options_, env_options_, this); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); @@ -831,13 +999,6 @@ ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { return &handle_; } -void ColumnFamilyMemTablesImpl::CheckMemtableFull() { - if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) { - flush_scheduler_->ScheduleFlush(current_); - current_->mem()->MarkFlushScheduled(); - } -} - uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { uint32_t column_family_id = 0; if (column_family != nullptr) { diff --git a/external/rocksdb/db/column_family.h b/external/rocksdb/db/column_family.h index 
e44873c7ac..43f249b257 100644 --- a/external/rocksdb/db/column_family.h +++ b/external/rocksdb/db/column_family.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,12 +19,10 @@ #include "db/write_controller.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" -#include "db/flush_scheduler.h" #include "rocksdb/compaction_job_stats.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "util/instrumented_mutex.h" #include "util/mutable_cf_options.h" #include "util/thread_local.h" @@ -44,6 +42,8 @@ class LogBuffer; class InstrumentedMutex; class InstrumentedMutexLock; +extern const double kSlowdownRatio; + // ColumnFamilyHandleImpl is the class that clients use to access different // column families. 
It has non-trivial destructor, which gets called when client // is done using the column family @@ -59,6 +59,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { virtual uint32_t GetID() const override; virtual const std::string& GetName() const override; + virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override; private: ColumnFamilyData* cfd_; @@ -132,6 +133,9 @@ struct SuperVersion { extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); +extern Status CheckConcurrentWritesSupported( + const ColumnFamilyOptions& cf_options); + extern ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options, const InternalKeyComparator* icmp, const ColumnFamilyOptions& src); @@ -156,14 +160,16 @@ class ColumnFamilyData { // thread-safe const std::string& GetName() const { return name_; } - // Ref() can only be called whily holding a DB mutex or during a - // single-threaded write. + // Ref() can only be called from a context where the caller can guarantee + // that ColumnFamilyData is alive (while holding a non-zero ref already, + // holding a DB mutex, or as the leader in a write batch group). void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); } - // will just decrease reference count to 0, but will not delete it. returns - // true if the ref count was decreased to zero. in that case, it can be - // deleted by the caller immediately, or later, by calling - // FreeDeadColumnFamilies() - // Unref() can only be called while holding a DB mutex + + // Unref decreases the reference count, but does not handle deletion + // when the count goes to 0. If this method returns true then the + // caller should delete the instance immediately, or later, by calling + // FreeDeadColumnFamilies(). Unref() can only be called while holding + // a DB mutex, or during single-threaded recovery. 
bool Unref() { int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed); assert(old_refs > 0); @@ -203,7 +209,7 @@ class ColumnFamilyData { const ImmutableCFOptions* ioptions() const { return &ioptions_; } // REQUIRES: DB mutex held // This returns the MutableCFOptions used by current SuperVersion - // You shoul use this API to reference MutableCFOptions most of the time. + // You should use this API to reference MutableCFOptions most of the time. const MutableCFOptions* GetCurrentMutableCFOptions() const { return &(super_version_->mutable_cf_options); } @@ -224,7 +230,7 @@ class ColumnFamilyData { MemTable* mem() { return mem_; } Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } - void SetCurrent(Version* current); + void SetCurrent(Version* _current); uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } @@ -249,11 +255,11 @@ class ColumnFamilyData { // A flag to tell a manual compaction's output is base level. 
static const int kCompactToBaseLevel; // REQUIRES: DB mutex held - Compaction* CompactRange( - const MutableCFOptions& mutable_cf_options, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end); + Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, + int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, + const InternalKey* end, InternalKey** compaction_end, + bool* manual_conflict); CompactionPicker* compaction_picker() { return compaction_picker_.get(); } // thread-safe @@ -305,15 +311,6 @@ class ColumnFamilyData { bool pending_flush() { return pending_flush_; } bool pending_compaction() { return pending_compaction_; } - private: - friend class ColumnFamilySet; - ColumnFamilyData(uint32_t id, const std::string& name, - Version* dummy_versions, Cache* table_cache, - WriteBuffer* write_buffer, - const ColumnFamilyOptions& options, - const DBOptions* db_options, const EnvOptions& env_options, - ColumnFamilySet* column_family_set); - // Recalculate some small conditions, which are changed only during // compaction, adding new memtable and/or // recalculation of compaction score. These values are used in @@ -322,6 +319,15 @@ class ColumnFamilyData { void RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options); + private: + friend class ColumnFamilySet; + ColumnFamilyData(uint32_t id, const std::string& name, + Version* dummy_versions, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + const ColumnFamilyOptions& options, + const DBOptions* db_options, const EnvOptions& env_options, + ColumnFamilySet* column_family_set); + uint32_t id_; const std::string name_; Version* dummy_versions_; // Head of circular doubly-linked list of versions. 
@@ -342,7 +348,7 @@ class ColumnFamilyData { std::unique_ptr internal_stats_; - WriteBuffer* write_buffer_; + WriteBufferManager* write_buffer_manager_; MemTable* mem_; MemTableList imm_; @@ -382,6 +388,8 @@ class ColumnFamilyData { // If true --> this ColumnFamily is currently present in // DBImpl::compaction_queue_ bool pending_compaction_; + + uint64_t prev_compaction_needed_bytes_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -430,7 +438,8 @@ class ColumnFamilySet { ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, - WriteBuffer* write_buffer, WriteController* write_controller); + WriteBufferManager* write_buffer_manager, + WriteController* write_controller); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -457,6 +466,8 @@ class ColumnFamilySet { // Don't call while iterating over ColumnFamilySet void FreeDeadColumnFamilies(); + Cache* get_table_cache() { return table_cache_; } + private: friend class ColumnFamilyData; // helper function that gets called from cfd destructor @@ -485,7 +496,7 @@ class ColumnFamilySet { const DBOptions* const db_options_; const EnvOptions env_options_; Cache* table_cache_; - WriteBuffer* write_buffer_; + WriteBufferManager* write_buffer_manager_; WriteController* write_controller_; }; @@ -493,15 +504,18 @@ class ColumnFamilySet { // memtables of different column families (specified by ID in the write batch) class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { public: - explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set, - FlushScheduler* flush_scheduler) - : column_family_set_(column_family_set), - current_(nullptr), - flush_scheduler_(flush_scheduler) {} + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) + : column_family_set_(column_family_set), current_(nullptr) {} + + // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed + // with the arguments used 
to construct *orig. + explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) + : column_family_set_(orig->column_family_set_), current_(nullptr) {} // sets current_ to ColumnFamilyData with column_family_id // returns false if column family doesn't exist - // REQUIRES: under a DB mutex OR from a write thread + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread bool Seek(uint32_t column_family_id) override; // Returns log number of the selected column family @@ -509,20 +523,23 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { uint64_t GetLogNumber() const override; // REQUIRES: Seek() called first - // REQUIRES: under a DB mutex OR from a write thread + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread virtual MemTable* GetMemTable() const override; // Returns column family handle for the selected column family - // REQUIRES: under a DB mutex OR from a write thread + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; - // REQUIRES: under a DB mutex OR from a write thread - virtual void CheckMemtableFull() override; + // Cannot be called while another thread is calling Seek(). 
+ // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + virtual ColumnFamilyData* current() override { return current_; } private: ColumnFamilySet* column_family_set_; ColumnFamilyData* current_; - FlushScheduler* flush_scheduler_; ColumnFamilyHandleInternal handle_; }; diff --git a/external/rocksdb/db/column_family_test.cc b/external/rocksdb/db/column_family_test.cc index a258b83df4..99bfbfb1ab 100644 --- a/external/rocksdb/db/column_family_test.cc +++ b/external/rocksdb/db/column_family_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,20 +13,23 @@ #include #include "db/db_impl.h" +#include "db/db_test_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "util/coding.h" +#include "util/fault_injection_test_env.h" +#include "util/options_parser.h" #include "util/string_util.h" +#include "util/sync_point.h" #include "util/testharness.h" #include "util/testutil.h" -#include "util/coding.h" -#include "util/sync_point.h" #include "utilities/merge_operators.h" -#if !(defined NDEBUG) || !defined(OS_WIN) - namespace rocksdb { +static const int kValueSize = 1000; + namespace { std::string RandomString(Random* rnd, int len) { std::string r; @@ -59,17 +62,94 @@ class ColumnFamilyTest : public testing::Test { env_ = new EnvCounter(Env::Default()); dbname_ = test::TmpDir() + "/column_family_test"; db_options_.create_if_missing = true; + db_options_.fail_if_options_file_error = true; db_options_.env = env_; DestroyDB(dbname_, Options(db_options_, column_family_options_)); } ~ColumnFamilyTest() { + Close(); + 
rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Destroy(); delete env_; } + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + if (k == 0) { + // Ugh. Random seed of 0 used to produce no entropy. This code + // preserves the implementation that was in place when all of the + // magic values in this file were picked. + *storage = std::string(kValueSize, ' '); + return Slice(*storage); + } else { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } + } + + void Build(int base, int n, int flush_every = 0) { + std::string key_space, value_space; + WriteBatch batch; + + for (int i = 0; i < n; i++) { + if (flush_every != 0 && i != 0 && i % flush_every == 0) { + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + } + + int keyi = base + i; + Slice key(DBTestBase::Key(keyi)); + + batch.Clear(); + batch.Put(handles_[0], key, Value(keyi, &value_space)); + batch.Put(handles_[1], key, Value(keyi, &value_space)); + batch.Put(handles_[2], key, Value(keyi, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void CheckMissed() { + uint64_t next_expected = 0; + uint64_t missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + for (int cf = 0; cf < 3; cf++) { + next_expected = 0; + Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + in.remove_prefix(3); + if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(static_cast(key), &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + } + + ASSERT_EQ(0, bad_keys); + ASSERT_EQ(0, bad_values); + ASSERT_EQ(0, missed); + (void)correct; + } + void Close() { for (auto h : handles_) { - 
delete h; + if (h) { + db_->DestroyColumnFamilyHandle(h); + } } handles_.clear(); names_.clear(); @@ -132,13 +212,7 @@ class ColumnFamilyTest : public testing::Test { } void Destroy() { - for (auto h : handles_) { - delete h; - } - handles_.clear(); - names_.clear(); - delete db_; - db_ = nullptr; + Close(); ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); } @@ -149,10 +223,18 @@ class ColumnFamilyTest : public testing::Test { handles_.resize(cfi + cfs.size()); names_.resize(cfi + cfs.size()); for (size_t i = 0; i < cfs.size(); ++i) { - ASSERT_OK(db_->CreateColumnFamily( - options.size() == 0 ? column_family_options_ : options[i], cfs[i], - &handles_[cfi])); + const auto& current_cf_opt = + options.size() == 0 ? column_family_options_ : options[i]; + ASSERT_OK( + db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi])); names_[cfi] = cfs[i]; + +#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor + // Verify the CF options of the returned CF handle. 
+ ColumnFamilyDescriptor desc; + ASSERT_OK(handles_[cfi]->GetDescriptor(&desc)); + RocksDBOptionsParser::VerifyCFOptions(desc.options, current_cf_opt); +#endif // !ROCKSDB_LITE cfi++; } } @@ -177,17 +259,23 @@ class ColumnFamilyTest : public testing::Test { void DropColumnFamilies(const std::vector& cfs) { for (auto cf : cfs) { ASSERT_OK(db_->DropColumnFamily(handles_[cf])); - delete handles_[cf]; + db_->DestroyColumnFamilyHandle(handles_[cf]); handles_[cf] = nullptr; names_[cf] = ""; } } - void PutRandomData(int cf, int num, int key_value_size) { + void PutRandomData(int cf, int num, int key_value_size, bool save = false) { for (int i = 0; i < num; ++i) { // 10 bytes for key, rest is value - ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10), - RandomString(&rnd_, key_value_size - 10))); + if (!save) { + ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11), + RandomString(&rnd_, key_value_size - 10))); + } else { + std::string key = test::RandomKey(&rnd_, 11); + keys_.insert(key); + ASSERT_OK(Put(cf, key, RandomString(&rnd_, key_value_size - 10))); + } } } @@ -377,6 +465,7 @@ class ColumnFamilyTest : public testing::Test { std::vector handles_; std::vector names_; + std::set keys_; ColumnFamilyOptions column_family_options_; DBOptions db_options_; std::string dbname_; @@ -412,6 +501,135 @@ TEST_F(ColumnFamilyTest, DontReuseColumnFamilyID) { } } +class FlushEmptyCFTestWithParam : public ColumnFamilyTest, + public testing::WithParamInterface { + public: + FlushEmptyCFTestWithParam() { allow_2pc_ = GetParam(); } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + bool allow_2pc_; +}; + +TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + db_options_.env = fault_env.get(); + db_options_.allow_2pc = allow_2pc_; + Open(); + CreateColumnFamilies({"one", "two"}); + // Generate log file A. 
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1 + + Reopen(); + // Log file A is not dropped after reopening because default column family's + // min log number is 0. + // It flushes to SST file X + ASSERT_OK(Put(1, "foo", "v1")); // seqID 2 + ASSERT_OK(Put(1, "bar", "v2")); // seqID 3 + // Current log file is file B now. While flushing, a new log file C is created + // and is set to current. Boths' min log number is set to file C in memory, so + // after flushing file B is deleted. At the same time, the min log number of + // default CF is not written to manifest. Log file A still remains. + // Flushed to SST file Y. + Flush(1); + Flush(0); + ASSERT_OK(Put(1, "bar", "v3")); // seqID 4 + ASSERT_OK(Put(1, "foo", "v4")); // seqID 5 + + // Preserve file system state up to here to simulate a crash condition. + fault_env->SetFilesystemActive(false); + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + + Close(); + fault_env->ResetState(); + + // Before opening, there are four files: + // Log file A contains seqID 1 + // Log file C contains seqID 4, 5 + // SST file X contains seqID 1 + // SST file Y contains seqID 2, 3 + // Min log number: + // default CF: 0 + // CF one, two: C + // When opening the DB, all the seqID should be preserved. + Open(names, {}); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); + Close(); + + db_options_.env = env_; +} + +TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + db_options_.env = fault_env.get(); + db_options_.allow_2pc = allow_2pc_; + Open(); + CreateColumnFamilies({"one", "two"}); + // Generate log file A. + ASSERT_OK(Put(1, "foo", "v1")); // seqID 1 + + Reopen(); + // Log file A is not dropped after reopening because default column family's + // min log number is 0. 
+ // It flushes to SST file X + ASSERT_OK(Put(1, "foo", "v1")); // seqID 2 + ASSERT_OK(Put(1, "bar", "v2")); // seqID 3 + // Current log file is file B now. While flushing, a new log file C is created + // and is set to current. Both CFs' min log number is set to file C so after + // flushing file B is deleted. Log file A still remains. + // Flushed to SST file Y. + Flush(1); + ASSERT_OK(Put(0, "bar", "v2")); // seqID 4 + ASSERT_OK(Put(2, "bar", "v2")); // seqID 5 + ASSERT_OK(Put(1, "bar", "v3")); // seqID 6 + // Flushing all column families. This forces all CFs' min log to current. This + // is written to the manifest file. Log file C is cleared. + Flush(0); + Flush(1); + Flush(2); + // Write to log file D + ASSERT_OK(Put(1, "bar", "v4")); // seqID 7 + ASSERT_OK(Put(1, "bar", "v5")); // seqID 8 + // Preserve file system state up to here to simulate a crash condition. + fault_env->SetFilesystemActive(false); + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + + Close(); + fault_env->ResetState(); + // Before opening, there are two logfiles: + // Log file A contains seqID 1 + // Log file D contains seqID 7, 8 + // Min log number: + // default CF: D + // CF one, two: D + // When opening the DB, log file D should be replayed using the seqID + // specified in the file. 
+ Open(names, {}); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "bar")); + Close(); + + db_options_.env = env_; +} + +INSTANTIATE_TEST_CASE_P(FlushEmptyCFTestWithParam, FlushEmptyCFTestWithParam, + ::testing::Bool()); + TEST_F(ColumnFamilyTest, AddDrop) { Open(); CreateColumnFamilies({"one", "two", "three"}); @@ -433,7 +651,7 @@ TEST_F(ColumnFamilyTest, AddDrop) { std::vector families; ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families)); - sort(families.begin(), families.end()); + std::sort(families.begin(), families.end()); ASSERT_TRUE(families == std::vector({"default", "four", "three"})); } @@ -694,6 +912,38 @@ TEST_F(ColumnFamilyTest, LogDeletionTest) { Close(); } +TEST_F(ColumnFamilyTest, CrashAfterFlush) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + db_options_.env = fault_env.get(); + Open(); + CreateColumnFamilies({"one"}); + + WriteBatch batch; + batch.Put(handles_[0], Slice("foo"), Slice("bar")); + batch.Put(handles_[1], Slice("foo"), Slice("bar")); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + Flush(0); + fault_env->SetFilesystemActive(false); + + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + Close(); + fault_env->DropUnsyncedFileData(); + fault_env->ResetState(); + Open(names, {}); + + // Write batch should be atomic. 
+ ASSERT_EQ(Get(0, "foo"), Get(1, "foo")); + + Close(); + db_options_.env = env_; +} + // Makes sure that obsolete log files get deleted TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) { // disable flushing stale column families @@ -721,7 +971,7 @@ TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) { two.max_write_buffer_number = 10; two.min_write_buffer_number_to_merge = 3; two.max_write_buffer_number_to_maintain = 2; - three.write_buffer_size = 4096 * 22 + 2048; + three.write_buffer_size = 4096 * 22; three.arena_block_size = 4096; three.max_write_buffer_number = 10; three.min_write_buffer_number_to_merge = 4; @@ -746,15 +996,15 @@ TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) { env_->SleepForMicroseconds(micros_wait_for_flush); AssertNumberOfImmutableMemtables({0, 1, 2, 0}); AssertCountLiveLogFiles(4); - PutRandomData(3, 91, 990); + PutRandomData(3, 93, 990); env_->SleepForMicroseconds(micros_wait_for_flush); AssertNumberOfImmutableMemtables({0, 1, 2, 1}); AssertCountLiveLogFiles(5); - PutRandomData(3, 90, 990); + PutRandomData(3, 88, 990); env_->SleepForMicroseconds(micros_wait_for_flush); AssertNumberOfImmutableMemtables({0, 1, 2, 2}); AssertCountLiveLogFiles(6); - PutRandomData(3, 90, 990); + PutRandomData(3, 88, 990); env_->SleepForMicroseconds(micros_wait_for_flush); AssertNumberOfImmutableMemtables({0, 1, 2, 3}); AssertCountLiveLogFiles(7); @@ -766,11 +1016,11 @@ TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) { WaitForFlush(2); AssertNumberOfImmutableMemtables({0, 1, 0, 3}); AssertCountLiveLogFiles(9); - PutRandomData(3, 90, 990); + PutRandomData(3, 88, 990); WaitForFlush(3); AssertNumberOfImmutableMemtables({0, 1, 0, 0}); AssertCountLiveLogFiles(10); - PutRandomData(3, 90, 990); + PutRandomData(3, 88, 990); env_->SleepForMicroseconds(micros_wait_for_flush); AssertNumberOfImmutableMemtables({0, 1, 0, 1}); AssertCountLiveLogFiles(11); @@ -778,9 +1028,9 @@ TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) { WaitForFlush(1); 
AssertNumberOfImmutableMemtables({0, 0, 0, 1}); AssertCountLiveLogFiles(5); - PutRandomData(3, 90 * 3, 990); + PutRandomData(3, 88 * 3, 990); WaitForFlush(3); - PutRandomData(3, 90 * 4, 990); + PutRandomData(3, 88 * 4, 990); WaitForFlush(3); AssertNumberOfImmutableMemtables({0, 0, 0, 0}); AssertCountLiveLogFiles(12); @@ -920,151 +1170,932 @@ TEST_F(ColumnFamilyTest, DifferentCompactionStyles) { Close(); } -#ifndef ROCKSDB_LITE // Tailing interator not supported -namespace { -std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; -} -} // anonymous namespace - -TEST_F(ColumnFamilyTest, NewIteratorsTest) { - // iter == 0 -- no tailing - // iter == 2 -- tailing - for (int iter = 0; iter < 2; ++iter) { - Open(); - CreateColumnFamiliesAndReopen({"one", "two"}); - ASSERT_OK(Put(0, "a", "b")); - ASSERT_OK(Put(1, "b", "a")); - ASSERT_OK(Put(2, "c", "m")); - ASSERT_OK(Put(2, "v", "t")); - std::vector iterators; - ReadOptions options; - options.tailing = (iter == 1); - ASSERT_OK(db_->NewIterators(options, handles_, &iterators)); - - for (auto it : iterators) { - it->SeekToFirst(); - } - ASSERT_EQ(IterStatus(iterators[0]), "a->b"); - ASSERT_EQ(IterStatus(iterators[1]), "b->a"); - ASSERT_EQ(IterStatus(iterators[2]), "c->m"); - - ASSERT_OK(Put(1, "x", "x")); - - for (auto it : iterators) { - it->Next(); - } - - ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); - if (iter == 0) { - // no tailing - ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); - } else { - // tailing - ASSERT_EQ(IterStatus(iterators[1]), "x->x"); - } - ASSERT_EQ(IterStatus(iterators[2]), "v->t"); - - for (auto it : iterators) { - delete it; - } - Destroy(); - } -} -#endif // !ROCKSDB_LITE +#ifndef ROCKSDB_LITE +// Sync points not supported in RocksDB Lite -#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported -TEST_F(ColumnFamilyTest, ReadOnlyDBTest) { 
+TEST_F(ColumnFamilyTest, MultipleManualCompactions) { Open(); - CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); - ASSERT_OK(Put(0, "a", "b")); - ASSERT_OK(Put(1, "foo", "bla")); - ASSERT_OK(Put(2, "foo", "blabla")); - ASSERT_OK(Put(3, "foo", "blablabla")); - ASSERT_OK(Put(4, "foo", "blablablabla")); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; - DropColumnFamilies({2}); - Close(); - // open only a subset of column families - AssertOpenReadOnly({"default", "one", "four"}); - ASSERT_EQ("NOT_FOUND", Get(0, "foo")); - ASSERT_EQ("bla", Get(1, "foo")); - ASSERT_EQ("blablablabla", Get(2, "foo")); + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + one.compaction_style = kCompactionStyleUniversal; - // test newiterators - { - std::vector iterators; - ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators)); - for (auto it : iterators) { - it->SeekToFirst(); - } - ASSERT_EQ(IterStatus(iterators[0]), "a->b"); - ASSERT_EQ(IterStatus(iterators[1]), "foo->bla"); - ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla"); - for (auto it : iterators) { - it->Next(); - } - ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); - ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); - ASSERT_EQ(IterStatus(iterators[2]), "(invalid)"); + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; - for (auto it : iterators) { - delete it; - } - } + two.compaction_style 
= kCompactionStyleLevel; + two.num_levels = 4; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; - Close(); - // can't open dropped column family - Status s = OpenReadOnly({"default", "one", "two"}); - ASSERT_TRUE(!s.ok()); + Reopen({default_cf, one, two}); - // Can't open without specifying default column family - s = OpenReadOnly({"one", "four"}); - ASSERT_TRUE(!s.ok()); -} -#endif // !ROCKSDB_LITE + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + bool cf_1_1 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"}, + {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"}, + {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4"); + cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3"); + } + }); -TEST_F(ColumnFamilyTest, DontRollEmptyLogs) { - Open(); - CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::vector threads; + threads.emplace_back([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); - for (size_t i = 0; i < handles_.size(); ++i) { - PutRandomData(static_cast(i), 10, 100); + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + WaitForFlush(2); + 
AssertFilesPerLevel(ToString(i + 1), 2); } - int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls(); - // this will trigger the flushes - for (int i = 0; i <= 4; ++i) { - ASSERT_OK(Flush(i)); + threads.emplace_back([&] { + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1"); + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[2], nullptr, nullptr)); + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2"); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5"); + for (auto& t : threads) { + t.join(); } - for (int i = 0; i < 4; ++i) { - WaitForFlush(i); + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + + // VERIFY compaction "two" + AssertFilesPerLevel("0,1", 2); + CompactAll(2); + AssertFilesPerLevel("0,1", 2); + // Compare against saved keys + std::set::iterator key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; } - int total_new_writable_files = - env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start; - ASSERT_EQ(static_cast(total_new_writable_files), handles_.size() + 1); Close(); } -TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) { +TEST_F(ColumnFamilyTest, AutomaticAndManualCompactions) { Open(); CreateColumnFamilies({"one", "two"}); ColumnFamilyOptions default_cf, one, two; - default_cf.write_buffer_size = 100000; // small write buffer size - default_cf.arena_block_size = 4096; - default_cf.disable_auto_compactions = true; - one.disable_auto_compactions = true; - two.disable_auto_compactions = true; - db_options_.max_total_wal_size = 210000; - - Reopen({default_cf, one, two}); + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; + db_options_.base_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels 
= 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + + bool cf_1_1 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"}, + {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"}, + {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4"); + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3"); + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1"); + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + WaitForFlush(2); + AssertFilesPerLevel(ToString(i + 1), 2); + } + std::thread threads([&] { + CompactRangeOptions compact_options; + 
compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[2], nullptr, nullptr)); + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2"); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5"); + threads.join(); + + // WAIT for compactions + WaitForCompaction(); + + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + + // VERIFY compaction "two" + AssertFilesPerLevel("0,1", 2); + CompactAll(2); + AssertFilesPerLevel("0,1", 2); + // Compare against saved keys + std::set::iterator key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + Close(); +} + +TEST_F(ColumnFamilyTest, ManualAndAutomaticCompactions) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; + db_options_.base_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 
1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + bool cf_1_1 = true; + bool cf_1_2 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"}, + {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"}, + {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); + cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3"); + } else if (cf_1_2) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2"); + cf_1_2 = false; + } + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1"); + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) { + PutRandomData(2, 10, 12000); + PutRandomData(2, 1, 10); + WaitForFlush(2); + AssertFilesPerLevel(ToString(i + 1), 2); + } + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5"); + threads.join(); + + // WAIT for compactions + WaitForCompaction(); + + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + + // VERIFY compaction "two" + AssertFilesPerLevel("0,1", 2); + CompactAll(2); + AssertFilesPerLevel("0,1", 2); + // Compare against saved keys + std::set::iterator key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + Close(); +} + +TEST_F(ColumnFamilyTest, SameCFManualManualCompactions) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + 
db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; + db_options_.base_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + bool cf_1_1 = true; + bool cf_1_2 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"}, + {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"}, + {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"}, + {"ColumnFamilyTest::ManualManual:1", + "ColumnFamilyTest::ManualManual:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4"); + cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3"); + } else if (cf_1_2) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2"); + cf_1_2 = false; + } + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::thread threads([&] { + CompactRangeOptions compact_options; + 
compact_options.exclusive_manual_compaction = true; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5"); + + WaitForFlush(1); + + // Add more L0 files and force another manual compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i), + 1); + } + + std::thread threads1([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1"); + + threads.join(); + threads1.join(); + WaitForCompaction(); + // VERIFY compaction "one" + ASSERT_LE(NumTableFilesAtLevel(0, 1), 2); + + // Compare against saved keys + std::set::iterator key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + Close(); +} + +TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactions) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; + db_options_.base_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + 
one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + bool cf_1_1 = true; + bool cf_1_2 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"}, + {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"}, + {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"}, + {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); + cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3"); + } else if (cf_1_2) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2"); + cf_1_2 = false; + } + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5"); + + WaitForFlush(1); + + // Add more L0 files and force automatic compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i), + 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1"); + + threads.join(); + WaitForCompaction(); + // VERIFY compaction "one" + ASSERT_LE(NumTableFilesAtLevel(0, 1), 2); + + // Compare against saved keys + std::set::iterator 
key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + Close(); +} + +TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; + db_options_.base_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleLevel; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + + // SETUP column family "one" -- level style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + bool cf_1_1 = true; + bool cf_1_2 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"}, + {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"}, + {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"}, + {"LevelCompactionPicker::PickCompactionBySize:0", + "ColumnFamilyTest::ManualAuto:3"}, + {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + 
TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); + cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3"); + } else if (cf_1_2) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2"); + cf_1_2 = false; + } + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5"); + + // Add more L0 files and force automatic compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i), + 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1"); + + threads.join(); + WaitForCompaction(); + // VERIFY compaction "one" + AssertFilesPerLevel("0,1", 1); + + // Compare against saved keys + std::set::iterator key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + Close(); +} + +// This test checks for automatic getting a conflict if there is a +// manual which has not yet been scheduled. +// The manual compaction waits in NotScheduled +// We generate more files and then trigger an automatic compaction +// This will wait because there is an unscheduled manual compaction. +// Once the conflict is hit, the manual compaction starts and ends +// Then another automatic will start and end. 
+TEST_F(ColumnFamilyTest, SameCFManualAutomaticConflict) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; + db_options_.base_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + bool cf_1_1 = true; + bool cf_1_2 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction()::Conflict", + "ColumnFamilyTest::ManualAutoCon:7"}, + {"ColumnFamilyTest::ManualAutoCon:9", + "ColumnFamilyTest::ManualAutoCon:8"}, + {"ColumnFamilyTest::ManualAutoCon:2", + "ColumnFamilyTest::ManualAutoCon:6"}, + {"ColumnFamilyTest::ManualAutoCon:4", + "ColumnFamilyTest::ManualAutoCon:5"}, + {"ColumnFamilyTest::ManualAutoCon:1", + "ColumnFamilyTest::ManualAutoCon:2"}, + {"ColumnFamilyTest::ManualAutoCon:1", + "ColumnFamilyTest::ManualAutoCon:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:4"); 
+ cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:3"); + } else if (cf_1_2) { + cf_1_2 = false; + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:2"); + } + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction:NotScheduled", [&](void* arg) { + InstrumentedMutex* mutex = static_cast(arg); + mutex->Unlock(); + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:9"); + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:7"); + mutex->Lock(); + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::thread threads([&] { + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:6"); + }); + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:8"); + WaitForFlush(1); + + // Add more L0 files and force automatic compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i), + 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:5"); + // Add more L0 files and force automatic compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + } + TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:1"); + + threads.join(); + WaitForCompaction(); + // VERIFY compaction "one" + ASSERT_LE(NumTableFilesAtLevel(0, 1), 3); + + // Compare against saved keys + std::set::iterator key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + + Close(); +} + +// In this test, we generate enough files to trigger automatic compactions. 
+// The automatic compaction waits in NonTrivial:AfterRun +// We generate more files and then trigger an automatic compaction +// This will wait because the automatic compaction has files it needs. +// Once the conflict is hit, the automatic compaction starts and ends +// Then the manual will run and end. +TEST_F(ColumnFamilyTest, SameCFAutomaticManualCompactions) { + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyOptions default_cf, one; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + db_options_.max_background_compactions = 3; + db_options_.base_background_compactions = 3; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.source_compaction_factor = 100; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + one.compaction_style = kCompactionStyleUniversal; + + one.num_levels = 1; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 120000; + + Reopen({default_cf, one}); + + bool cf_1_1 = true; + bool cf_1_2 = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"}, + {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"}, + {"CompactionPicker::CompactRange:Conflict", + "ColumnFamilyTest::AutoManual:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (cf_1_1) { + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4"); + cf_1_1 = false; + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3"); + } else if (cf_1_2) { + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2"); + cf_1_2 = false; + } + }); + + 
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + AssertFilesPerLevel(ToString(i + 1), 1); + } + + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5"); + + // Add another L0 file and force automatic compaction + for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { + PutRandomData(1, 10, 12000, true); + PutRandomData(1, 1, 10, true); + WaitForFlush(1); + } + + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = false; + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + + TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1"); + + WaitForCompaction(); + // VERIFY compaction "one" + AssertFilesPerLevel("1", 1); + // Compare against saved keys + std::set::iterator key_iter = keys_.begin(); + while (key_iter != keys_.end()) { + ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); + key_iter++; + } + + Close(); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // Tailing interator not supported +namespace { +std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; +} +} // anonymous namespace + +TEST_F(ColumnFamilyTest, NewIteratorsTest) { + // iter == 0 -- no tailing + // iter == 2 -- tailing + for (int iter = 0; iter < 2; ++iter) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "a", "b")); + ASSERT_OK(Put(1, "b", "a")); + ASSERT_OK(Put(2, "c", "m")); + ASSERT_OK(Put(2, "v", "t")); + std::vector iterators; + ReadOptions options; + options.tailing = (iter == 1); + ASSERT_OK(db_->NewIterators(options, handles_, &iterators)); + + for (auto it : iterators) { + it->SeekToFirst(); + } + 
ASSERT_EQ(IterStatus(iterators[0]), "a->b"); + ASSERT_EQ(IterStatus(iterators[1]), "b->a"); + ASSERT_EQ(IterStatus(iterators[2]), "c->m"); + + ASSERT_OK(Put(1, "x", "x")); + + for (auto it : iterators) { + it->Next(); + } + + ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); + if (iter == 0) { + // no tailing + ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); + } else { + // tailing + ASSERT_EQ(IterStatus(iterators[1]), "x->x"); + } + ASSERT_EQ(IterStatus(iterators[2]), "v->t"); + + for (auto it : iterators) { + delete it; + } + Destroy(); + } +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported +TEST_F(ColumnFamilyTest, ReadOnlyDBTest) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); + ASSERT_OK(Put(0, "a", "b")); + ASSERT_OK(Put(1, "foo", "bla")); + ASSERT_OK(Put(2, "foo", "blabla")); + ASSERT_OK(Put(3, "foo", "blablabla")); + ASSERT_OK(Put(4, "foo", "blablablabla")); + + DropColumnFamilies({2}); + Close(); + // open only a subset of column families + AssertOpenReadOnly({"default", "one", "four"}); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + ASSERT_EQ("bla", Get(1, "foo")); + ASSERT_EQ("blablablabla", Get(2, "foo")); + + + // test newiterators + { + std::vector iterators; + ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators)); + for (auto it : iterators) { + it->SeekToFirst(); + } + ASSERT_EQ(IterStatus(iterators[0]), "a->b"); + ASSERT_EQ(IterStatus(iterators[1]), "foo->bla"); + ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla"); + for (auto it : iterators) { + it->Next(); + } + ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); + ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); + ASSERT_EQ(IterStatus(iterators[2]), "(invalid)"); + + for (auto it : iterators) { + delete it; + } + } + + Close(); + // can't open dropped column family + Status s = OpenReadOnly({"default", "one", "two"}); + ASSERT_TRUE(!s.ok()); + + // Can't open without specifying default column family + s = 
OpenReadOnly({"one", "four"}); + ASSERT_TRUE(!s.ok()); +} +#endif // !ROCKSDB_LITE + +TEST_F(ColumnFamilyTest, DontRollEmptyLogs) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); + + for (size_t i = 0; i < handles_.size(); ++i) { + PutRandomData(static_cast(i), 10, 100); + } + int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls(); + // this will trigger the flushes + for (int i = 0; i <= 4; ++i) { + ASSERT_OK(Flush(i)); + } + + for (int i = 0; i < 4; ++i) { + WaitForFlush(i); + } + int total_new_writable_files = + env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start; + ASSERT_EQ(static_cast(total_new_writable_files), handles_.size() + 1); + Close(); +} + +TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + default_cf.write_buffer_size = 100000; // small write buffer size + default_cf.arena_block_size = 4096; + default_cf.disable_auto_compactions = true; + one.disable_auto_compactions = true; + two.disable_auto_compactions = true; + db_options_.max_total_wal_size = 210000; + + Reopen({default_cf, one, two}); PutRandomData(2, 1, 10); // 10 bytes for (int i = 0; i < 2; ++i) { @@ -1081,6 +2112,9 @@ TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) { // 3 files for default column families, 1 file for column family [two], zero // files for column family [one], because it's empty AssertCountLiveFiles(4); + + Flush(0); + ASSERT_EQ(0, dbfull()->TEST_total_log_size()); Close(); } @@ -1164,13 +2198,27 @@ TEST_F(ColumnFamilyTest, ReadDroppedColumnFamily) { PutRandomData(1, kKeysNum, 100); PutRandomData(2, kKeysNum, 100); - if (iter == 0) { - // Drop CF two - ASSERT_OK(db_->DropColumnFamily(handles_[2])); - } else { - // delete CF two - delete handles_[2]; - handles_[2] = nullptr; + { + std::unique_ptr iterator( + db_->NewIterator(ReadOptions(), handles_[2])); + iterator->SeekToFirst(); + + if (iter == 0) { + // Drop CF two 
+ ASSERT_OK(db_->DropColumnFamily(handles_[2])); + } else { + // delete CF two + db_->DestroyColumnFamilyHandle(handles_[2]); + handles_[2] = nullptr; + } + // Make sure iterator created can still be used. + int count = 0; + for (; iterator->Valid(); iterator->Next()) { + ASSERT_OK(iterator->status()); + ++count; + } + ASSERT_OK(iterator->status()); + ASSERT_EQ(count, kKeysNum); } // Add bunch more data to other CFs @@ -1212,10 +2260,12 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) { Reopen({options, options}); rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"VersionSet::LogAndApply::ColumnFamilyDrop:1" + {{"VersionSet::LogAndApply::ColumnFamilyDrop:0", + "FlushJob::WriteLevel0Table"}, + {"VersionSet::LogAndApply::ColumnFamilyDrop:1", "FlushJob::InstallResults"}, {"FlushJob::InstallResults", - "VersionSet::LogAndApply::ColumnFamilyDrop:2", }}); + "VersionSet::LogAndApply::ColumnFamilyDrop:2"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); test::SleepingBackgroundTask sleeping_task; @@ -1258,17 +2308,823 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) { Close(); Destroy(); +} + +#ifndef ROCKSDB_LITE +// skipped as persisting options is not supported in ROCKSDB_LITE +namespace { +std::atomic test_stage(0); +const int kMainThreadStartPersistingOptionsFile = 1; +const int kChildThreadFinishDroppingColumnFamily = 2; +const int kChildThreadWaitingMainThreadPersistOptions = 3; +void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id, + std::vector* comparators) { + while (test_stage < kMainThreadStartPersistingOptionsFile) { + Env::Default()->SleepForMicroseconds(100); + } + cf_test->DropColumnFamilies({cf_id}); + if ((*comparators)[cf_id]) { + delete (*comparators)[cf_id]; + (*comparators)[cf_id] = nullptr; + } + test_stage = kChildThreadFinishDroppingColumnFamily; +} +} // namespace + +TEST_F(ColumnFamilyTest, CreateAndDropRace) { + const int kCfCount = 5; + std::vector cf_opts; + std::vector comparators; + for (int i = 0; i < 
kCfCount; ++i) { + cf_opts.emplace_back(); + comparators.push_back(new test::SimpleSuffixReverseComparator()); + cf_opts.back().comparator = comparators.back(); + } + db_options_.create_if_missing = true; + db_options_.create_missing_column_families = true; + + auto main_thread_id = std::this_thread::get_id(); + + rocksdb::SyncPoint::GetInstance()->SetCallBack("PersistRocksDBOptions:start", + [&](void* arg) { + auto current_thread_id = std::this_thread::get_id(); + // If it's the main thread hitting this sync-point, then it + // will be blocked until some other thread update the test_stage. + if (main_thread_id == current_thread_id) { + test_stage = kMainThreadStartPersistingOptionsFile; + while (test_stage < kChildThreadFinishDroppingColumnFamily) { + Env::Default()->SleepForMicroseconds(100); + } + } + }); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::EnterUnbatched:Wait", [&](void* arg) { + // This means a thread doing DropColumnFamily() is waiting for + // other thread to finish persisting options. + // In such case, we update the test_stage to unblock the main thread. + test_stage = kChildThreadWaitingMainThreadPersistOptions; + + // Note that based on the test setting, this must not be the + // main thread. 
+ ASSERT_NE(main_thread_id, std::this_thread::get_id()); + }); + + // Create a database with four column families + Open({"default", "one", "two", "three"}, + {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]}); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Start a thread that will drop the first column family + // and its comparator + std::thread drop_cf_thread(DropSingleColumnFamily, this, 1, &comparators); + + DropColumnFamilies({2}); + + drop_cf_thread.join(); + Close(); + Destroy(); + for (auto* comparator : comparators) { + if (comparator) { + delete comparator; + } + } +} +#endif // !ROCKSDB_LITE + +TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { + const uint64_t kBaseRate = 810000u; + db_options_.delayed_write_rate = kBaseRate; + db_options_.base_background_compactions = 2; + db_options_.max_background_compactions = 6; + + Open({"default"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + MutableCFOptions mutable_cf_options( + Options(db_options_, column_family_options_), + ImmutableCFOptions(Options(db_options_, column_family_options_))); + + mutable_cf_options.level0_slowdown_writes_trigger = 20; + mutable_cf_options.level0_stop_writes_trigger = 10000; + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + mutable_cf_options.disable_auto_compactions = false; + + vstorage->TEST_set_estimated_compaction_needed_bytes(50); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(201); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + 
ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(400); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(500); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2 / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(450); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(205); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(202); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(201); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + 
ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(198); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(399); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(599); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(2001); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(3001); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(390); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + 
vstorage->TEST_set_estimated_compaction_needed_bytes(100); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->set_l0_delay_trigger_count(100); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(101); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2 / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->set_l0_delay_trigger_count(0); + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2 / 1.2 / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->set_l0_delay_trigger_count(101); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2 / 1.2 / 1.2 / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(200); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2 / 1.2 / 1.2, + 
dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->set_l0_delay_trigger_count(0); + vstorage->TEST_set_estimated_compaction_needed_bytes(0); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + mutable_cf_options.disable_auto_compactions = true; + dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage->set_l0_delay_trigger_count(50); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->set_l0_delay_trigger_count(60); + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + mutable_cf_options.disable_auto_compactions = false; + vstorage->set_l0_delay_trigger_count(70); + vstorage->TEST_set_estimated_compaction_needed_bytes(500); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->set_l0_delay_trigger_count(71); + vstorage->TEST_set_estimated_compaction_needed_bytes(501); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + 
ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); +} + +TEST_F(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { + db_options_.base_background_compactions = 2; + db_options_.max_background_compactions = 6; + Open({"default"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + MutableCFOptions mutable_cf_options( + Options(db_options_, column_family_options_), + ImmutableCFOptions(Options(db_options_, column_family_options_))); + + // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 36; + mutable_cf_options.level0_stop_writes_trigger = 50; + // Speedup threshold = 200 / 4 = 50 + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + vstorage->TEST_set_estimated_compaction_needed_bytes(40); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(50); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(45); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(7); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(9); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, 
dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(6); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 16; + mutable_cf_options.level0_stop_writes_trigger = 30; + + vstorage->set_l0_delay_trigger_count(5); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(7); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(3); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); +} + +TEST_F(ColumnFamilyTest, WriteStallTwoColumnFamilies) { + const uint64_t kBaseRate = 810000u; + db_options_.delayed_write_rate = kBaseRate; + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + ColumnFamilyData* cfd1 = + static_cast(handles_[1])->cfd(); + VersionStorageInfo* vstorage1 = cfd1->current()->storage_info(); + + MutableCFOptions mutable_cf_options( + Options(db_options_, column_family_options_), + ImmutableCFOptions(Options(db_options_, column_family_options_))); + mutable_cf_options.level0_slowdown_writes_trigger = 20; + mutable_cf_options.level0_stop_writes_trigger = 10000; + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + MutableCFOptions mutable_cf_options1 = mutable_cf_options; + mutable_cf_options1.soft_pending_compaction_bytes_limit = 500; + + vstorage->TEST_set_estimated_compaction_needed_bytes(50); + cfd->RecalculateWriteStallConditions(mutable_cf_options); 
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(201); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(600); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(70); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(800); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2 / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(700); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + 
dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(500); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2 / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(600); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(kBaseRate / 1.2, + dbfull()->TEST_write_controler().delayed_write_rate()); +} + +TEST_F(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { + db_options_.base_background_compactions = 2; + db_options_.max_background_compactions = 6; + column_family_options_.soft_pending_compaction_bytes_limit = 200; + column_family_options_.hard_pending_compaction_bytes_limit = 2000; + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + ColumnFamilyData* cfd1 = + static_cast(handles_[1])->cfd(); + VersionStorageInfo* vstorage1 = cfd1->current()->storage_info(); + + MutableCFOptions mutable_cf_options( + Options(db_options_, column_family_options_), + ImmutableCFOptions(Options(db_options_, column_family_options_))); + // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 36; + mutable_cf_options.level0_stop_writes_trigger = 30; + // Speedup threshold = 200 / 4 = 50 + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + MutableCFOptions mutable_cf_options1 = mutable_cf_options; + 
mutable_cf_options1.level0_slowdown_writes_trigger = 16; + + vstorage->TEST_set_estimated_compaction_needed_bytes(40); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(60); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(30); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(70); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(20); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(3); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(9); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->set_l0_delay_trigger_count(2); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(0); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); +} + +#ifndef ROCKSDB_LITE +TEST_F(ColumnFamilyTest, FlushCloseWALFiles) { + SpecialEnv env(Env::Default()); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", 
"mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + // Block flush jobs from running + test::SleepingBackgroundTask sleeping_task; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + WaitForFlush(1); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_F(ColumnFamilyTest, IteratorCloseWALFile1) { + SpecialEnv env(Env::Default()); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + // Create an iterator holding the current super version. + Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]); + // A flush will make `it` hold the last reference of its super version. + Flush(1); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + // Flush jobs will close previous WAL files after finishing. By + // block flush jobs from running, we trigger a condition where + // the iterator destructor should close the WAL files. 
+ test::SleepingBackgroundTask sleeping_task; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + delete it; + ASSERT_EQ(1, env.num_open_wal_file_.load()); + + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + WaitForFlush(1); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} + +TEST_F(ColumnFamilyTest, IteratorCloseWALFile2) { + SpecialEnv env(Env::Default()); + // Allow both of flush and purge job to schedule. + env.SetBackgroundThreads(2, Env::HIGH); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + // Create an iterator holding the current super version. + ReadOptions ro; + ro.background_purge_on_iterator_cleanup = true; + Iterator* it = db_->NewIterator(ro, handles_[1]); + // A flush will make `it` hold the last reference of its super version. 
+ Flush(1); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"ColumnFamilyTest::IteratorCloseWALFile2:0", + "DBImpl::BGWorkPurge:start"}, + {"ColumnFamilyTest::IteratorCloseWALFile2:2", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + delete it; + ASSERT_EQ(2, env.num_open_wal_file_.load()); + + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0"); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1"); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2"); + WaitForFlush(1); + ASSERT_EQ(1, env.num_open_wal_file_.load()); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); } -} // namespace rocksdb +#ifndef ROCKSDB_LITE // TEST functions are not supported in lite +TEST_F(ColumnFamilyTest, ForwardIteratorCloseWALFile) { + SpecialEnv env(Env::Default()); + // Allow both of flush and purge job to schedule. 
+ env.SetBackgroundThreads(2, Env::HIGH); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(3)); + column_family_options_.level0_file_num_compaction_trigger = 2; + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodar2", "mirko")); + Flush(1); + + // Create an iterator holding the current super version, as well as + // the SST file just flushed. + ReadOptions ro; + ro.tailing = true; + ro.background_purge_on_iterator_cleanup = true; + Iterator* it = db_->NewIterator(ro, handles_[1]); + // A flush will make `it` hold the last reference of its super version. + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodar2", "mirko")); + Flush(1); + + WaitForCompaction(); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"ColumnFamilyTest::IteratorCloseWALFile2:0", + "DBImpl::BGWorkPurge:start"}, + {"ColumnFamilyTest::IteratorCloseWALFile2:2", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + env.delete_count_.store(0); + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + it->Seek(""); + ASSERT_EQ(2, env.num_open_wal_file_.load()); + ASSERT_EQ(0, env.delete_count_.load()); + + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0"); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1"); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + ASSERT_EQ(1, env.delete_count_.load()); + 
TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2"); + WaitForFlush(1); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + ASSERT_EQ(1, env.delete_count_.load()); + + delete it; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} +#endif // !ROCKSDB_LITE + +// Disable on windows because SyncWAL requires env->IsSyncThreadSafe() +// to return true which is not so in unbuffered mode. +#ifndef OS_WIN +TEST_F(ColumnFamilyTest, LogSyncConflictFlush) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + + Put(0, "", ""); + Put(1, "foo", "bar"); + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1", + "ColumnFamilyTest::LogSyncConflictFlush:1"}, + {"ColumnFamilyTest::LogSyncConflictFlush:2", + "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}}); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::thread thread([&] { db_->SyncWAL(); }); + + TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1"); + Flush(1); + Put(1, "foo", "bar"); + Flush(1); + + TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2"); + + thread.join(); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Close(); +} #endif +// this test is placed here, because the infrastructure for Column Family +// test is being used to ensure a roll of wal files. 
+// Basic idea is to test that WAL truncation is being detected and not +// ignored +TEST_F(ColumnFamilyTest, DISABLED_LogTruncationTest) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + + Build(0, 100); + + // Flush the 0th column family to force a roll of the wal log + Flush(0); + + // Add some more entries + Build(100, 100); + + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + + // collect wal files + std::vector logfs; + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (!(ParseFileName(filenames[i], &number, &type))) continue; + + if (type != kLogFile) continue; + + logfs.push_back(filenames[i]); + } + + std::sort(logfs.begin(), logfs.end()); + ASSERT_GE(logfs.size(), 2); + + // Take the last but one file, and truncate it + std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2]; + std::vector names_save = names_; + + uint64_t fsize; + ASSERT_OK(env_->GetFileSize(fpath, &fsize)); + ASSERT_GT(fsize, 0); + + Close(); + + std::string backup_logs = dbname_ + "/backup_logs"; + std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2]; + + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); + // Not sure how easy it is to make this data driven. 
+ // need to read back the WAL file and truncate last 10 + // entries + CopyFile(fpath, t_fpath, fsize - 9180); + + ASSERT_OK(env_->DeleteFile(fpath)); + ASSERT_OK(env_->RenameFile(t_fpath, fpath)); + + db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + OpenReadOnly(names_save); + + CheckMissed(); + + Close(); + + Open(names_save); + + CheckMissed(); + + Close(); + + // cleanup + env_->DeleteDir(backup_logs); +} +} // namespace rocksdb + int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/external/rocksdb/db/compact_files_test.cc b/external/rocksdb/db/compact_files_test.cc index cbd9d7a09d..794defb115 100644 --- a/external/rocksdb/db/compact_files_test.cc +++ b/external/rocksdb/db/compact_files_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,11 +7,13 @@ #include #include +#include #include #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/string_util.h" +#include "util/sync_point.h" #include "util/testharness.h" namespace rocksdb { @@ -53,6 +55,63 @@ class FlushedFileCollector : public EventListener { std::mutex mutex_; }; +TEST_F(CompactFilesTest, L0ConflictsFiles) { + Options options; + // to trigger compaction more easily + const int kWriteBufferSize = 10000; + const int kLevel0Trigger = 2; + options.create_if_missing = true; + options.compaction_style = kCompactionStyleLevel; + // Small slowdown and stop trigger for experimental purpose. 
+ options.level0_slowdown_writes_trigger = 20; + options.level0_stop_writes_trigger = 20; + options.level0_stop_writes_trigger = 20; + options.write_buffer_size = kWriteBufferSize; + options.level0_file_num_compaction_trigger = kLevel0Trigger; + options.compression = kNoCompression; + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + assert(s.ok()); + assert(db); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:0", "BackgroundCallCompaction:0"}, + {"BackgroundCallCompaction:1", "CompactFilesImpl:1"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // create couple files + // Background compaction starts and waits in BackgroundCallCompaction:0 + for (int i = 0; i < kLevel0Trigger * 4; ++i) { + db->Put(WriteOptions(), ToString(i), ""); + db->Put(WriteOptions(), ToString(100 - i), ""); + db->Flush(FlushOptions()); + } + + rocksdb::ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + std::string file1; + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + if (file1 == "") { + file1 = file.db_path + "/" + file.name; + } else { + std::string file2 = file.db_path + "/" + file.name; + // Another thread starts a compact files and creates an L0 compaction + // The background compaction then notices that there is an L0 compaction + // already in progress and doesn't do an L0 compaction + // Once the background compaction finishes, the compact files finishes + ASSERT_OK( + db->CompactFiles(rocksdb::CompactionOptions(), {file1, file2}, 0)); + break; + } + } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + delete db; +} + TEST_F(CompactFilesTest, ObsoleteFiles) { Options options; // to trigger compaction more easily @@ -84,9 +143,6 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { } auto l0_files = collector->GetFlushedFiles(); - CompactionOptions compact_opt; - compact_opt.compression = kNoCompression; - 
compact_opt.output_file_size_limit = kWriteBufferSize * 5; ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); // verify all compaction input files are deleted @@ -96,6 +152,62 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { delete db; } +TEST_F(CompactFilesTest, CapturingPendingFiles) { + Options options; + options.create_if_missing = true; + // Disable RocksDB background compaction. + options.compaction_style = kCompactionStyleNone; + // Always do full scans for obsolete files (needed to reproduce the issue). + options.delete_obsolete_files_period_micros = 0; + + // Add listener. + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + assert(s.ok()); + assert(db); + + // Create 5 files. + for (int i = 0; i < 5; ++i) { + db->Put(WriteOptions(), "key" + ToString(i), "value"); + db->Flush(FlushOptions()); + } + + auto l0_files = collector->GetFlushedFiles(); + EXPECT_EQ(5, l0_files.size()); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"}, + {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Start compacting files. + std::thread compaction_thread( + [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); }); + + // In the meantime flush another file. + TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0"); + db->Put(WriteOptions(), "key5", "value"); + db->Flush(FlushOptions()); + TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1"); + + compaction_thread.join(); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + delete db; + + // Make sure we can reopen the DB. 
+ s = DB::Open(options, db_name_, &db); + ASSERT_TRUE(s.ok()); + assert(db); + delete db; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/external/rocksdb/db/compacted_db_impl.cc b/external/rocksdb/db/compacted_db_impl.cc index 980b34e12c..e72a4bd595 100644 --- a/external/rocksdb/db/compacted_db_impl.cc +++ b/external/rocksdb/db/compacted_db_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -93,7 +93,7 @@ Status CompactedDBImpl::Init(const Options& options) { mutex_.Lock(); ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, ColumnFamilyOptions(options)); - Status s = Recover({ cf }, true /* read only */, false); + Status s = Recover({cf}, true /* read only */, false, true); if (s.ok()) { cfd_ = reinterpret_cast( DefaultColumnFamily())->cfd(); diff --git a/external/rocksdb/db/compacted_db_impl.h b/external/rocksdb/db/compacted_db_impl.h index ec2d53762e..9c42010a67 100644 --- a/external/rocksdb/db/compacted_db_impl.h +++ b/external/rocksdb/db/compacted_db_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/compaction.cc b/external/rocksdb/db/compaction.cc index bb806653b3..cb96676c00 100644 --- a/external/rocksdb/db/compaction.cc +++ b/external/rocksdb/db/compaction.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,8 +16,8 @@ #include #include -#include "rocksdb/compaction_filter.h" #include "db/column_family.h" +#include "rocksdb/compaction_filter.h" #include "util/logging.h" #include "util/sync_point.h" @@ -46,7 +46,7 @@ void Compaction::GetBoundaryKeys( Slice* largest_user_key) { bool initialized = false; const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); - for (uint32_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i < inputs.size(); ++i) { if (inputs[i].files.empty()) { continue; } @@ -128,8 +128,8 @@ bool Compaction::TEST_IsBottommostLevel( bool Compaction::IsFullCompaction( VersionStorageInfo* vstorage, const std::vector& inputs) { - int num_files_in_compaction = 0; - int total_num_files = 0; + size_t num_files_in_compaction = 0; + size_t total_num_files = 0; for (int l = 0; l < vstorage->num_levels(); l++) { total_num_files += vstorage->NumLevelFiles(l); } @@ -147,7 +147,8 @@ Compaction::Compaction(VersionStorageInfo* vstorage, uint32_t _output_path_id, CompressionType _compression, std::vector _grandparents, bool _manual_compaction, double _score, - bool _deletion_compaction) + bool _deletion_compaction, + CompactionReason _compaction_reason) : start_level_(_inputs[0].level), output_level_(_output_level), max_output_file_size_(_target_file_size), @@ -161,14 +162,15 @@ Compaction::Compaction(VersionStorageInfo* vstorage, deletion_compaction_(_deletion_compaction), inputs_(std::move(_inputs)), grandparents_(std::move(_grandparents)), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), score_(_score), bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), is_full_compaction_(IsFullCompaction(vstorage, 
inputs_)), - is_manual_compaction_(_manual_compaction) { + is_manual_compaction_(_manual_compaction), + compaction_reason_(_compaction_reason) { MarkFilesBeingCompacted(true); + if (is_manual_compaction_) { + compaction_reason_ = CompactionReason::kManualCompaction; + } #ifndef NDEBUG for (size_t i = 1; i < inputs_.size(); ++i) { @@ -184,6 +186,8 @@ Compaction::Compaction(VersionStorageInfo* vstorage, &arena_); } } + + GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_); } Compaction::~Compaction() { @@ -198,9 +202,11 @@ Compaction::~Compaction() { } bool Compaction::InputCompressionMatchesOutput() const { - int base_level = input_version_->storage_info()->base_level(); - bool matches = (GetCompressionType(*cfd_->ioptions(), start_level_, - base_level) == output_compression_); + VersionStorageInfo* vstorage = input_version_->storage_info(); + int base_level = vstorage->base_level(); + bool matches = + (GetCompressionType(*cfd_->ioptions(), vstorage, mutable_cf_options_, + start_level_, base_level) == output_compression_); if (matches) { TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches"); return true; @@ -282,38 +288,12 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( return true; } -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. 
- const InternalKeyComparator* icmp = &cfd_->internal_comparator(); - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->fd.GetFileSize(); - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > max_grandparent_overlap_bytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - // Mark (or clear) each file that is being compacted void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { for (size_t i = 0; i < num_input_levels(); i++) { - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(mark_as_compacted ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); + for (size_t j = 0; j < inputs_[i].size(); j++) { + assert(mark_as_compacted ? 
!inputs_[i][j]->being_compacted + : inputs_[i][j]->being_compacted); inputs_[i][j]->being_compacted = mark_as_compacted; } } @@ -371,7 +351,7 @@ int InputSummary(const std::vector& files, char* output, int len) { *output = '\0'; int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { int sz = len - write; int ret; char sztxt[16]; @@ -388,10 +368,8 @@ int InputSummary(const std::vector& files, char* output, void Compaction::Summary(char* output, int len) { int write = - snprintf(output, len, "Base version %" PRIu64 - " Base level %d, inputs: [", - input_version_->GetVersionNumber(), - start_level_); + snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [", + input_version_->GetVersionNumber(), start_level_); if (write < 0 || write >= len) { return; } @@ -413,7 +391,7 @@ void Compaction::Summary(char* output, int len) { snprintf(output + write, len - write, "]"); } -uint64_t Compaction::OutputFilePreallocationSize() { +uint64_t Compaction::OutputFilePreallocationSize() const { uint64_t preallocation_size = 0; if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel || @@ -428,7 +406,7 @@ uint64_t Compaction::OutputFilePreallocationSize() { } // Over-estimate slightly so we don't end up just barely crossing // the threshold - return preallocation_size * 1.1; + return preallocation_size + (preallocation_size / 10); } std::unique_ptr Compaction::CreateCompactionFilter() const { @@ -439,6 +417,7 @@ std::unique_ptr Compaction::CreateCompactionFilter() const { CompactionFilter::Context context; context.is_full_compaction = is_full_compaction_; context.is_manual_compaction = is_manual_compaction_; + context.column_family_id = cfd_->GetID(); return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); } diff --git a/external/rocksdb/db/compaction.h b/external/rocksdb/db/compaction.h index 36c62ff269..a596f313a5 100644 --- a/external/rocksdb/db/compaction.h +++ 
b/external/rocksdb/db/compaction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -41,7 +41,8 @@ class Compaction { uint32_t output_path_id, CompressionType compression, std::vector grandparents, bool manual_compaction = false, double score = -1, - bool deletion_compaction = false); + bool deletion_compaction = false, + CompactionReason compaction_reason = CompactionReason::kUnknown); // No copying allowed Compaction(const Compaction&) = delete; @@ -103,7 +104,7 @@ class Compaction { } // Returns the LevelFilesBrief of the specified compaction input level. - LevelFilesBrief* input_levels(size_t compaction_input_level) { + const LevelFilesBrief* input_levels(size_t compaction_input_level) const { return &input_levels_[compaction_input_level]; } @@ -131,12 +132,10 @@ class Compaction { bool KeyNotExistsBeyondOutputLevel(const Slice& user_key, std::vector* level_ptrs) const; - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. + // + // Requirement: DB mutex held void ReleaseCompactionFiles(Status status); // Returns the summary of the compaction in "output" with maximum "len" @@ -148,13 +147,13 @@ class Compaction { double score() const { return score_; } // Is this compaction creating a file in the bottom most level? - bool bottommost_level() { return bottommost_level_; } + bool bottommost_level() const { return bottommost_level_; } // Does this compaction include all sst files? 
- bool is_full_compaction() { return is_full_compaction_; } + bool is_full_compaction() const { return is_full_compaction_; } // Was this compaction triggered manually by the client? - bool is_manual_compaction() { return is_manual_compaction_; } + bool is_manual_compaction() const { return is_manual_compaction_; } // Used when allow_trivial_move option is set in // Universal compaction. If all the input files are @@ -167,19 +166,21 @@ class Compaction { // Used when allow_trivial_move option is set in // Universal compaction. Returns true, if the input files // are non-overlapping and can be trivially moved. - bool is_trivial_move() { return is_trivial_move_; } + bool is_trivial_move() const { return is_trivial_move_; } // How many total levels are there? int number_levels() const { return number_levels_; } // Return the MutableCFOptions that should be used throughout the compaction // procedure - const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; } + const MutableCFOptions* mutable_cf_options() const { + return &mutable_cf_options_; + } // Returns the size in bytes that the output file should be preallocated to. // In level compaction, that is max_file_size_. In universal compaction, that // is the sum of all input file sizes. 
- uint64_t OutputFilePreallocationSize(); + uint64_t OutputFilePreallocationSize() const; void SetInputVersion(Version* input_version); @@ -210,6 +211,28 @@ class Compaction { int output_level, VersionStorageInfo* vstorage, const std::vector& inputs); + TablePropertiesCollection GetOutputTableProperties() const { + return output_table_properties_; + } + + void SetOutputTableProperties(TablePropertiesCollection tp) { + output_table_properties_ = std::move(tp); + } + + Slice GetSmallestUserKey() const { return smallest_user_key_; } + + Slice GetLargestUserKey() const { return largest_user_key_; } + + CompactionReason compaction_reason() { return compaction_reason_; } + + const std::vector& grandparents() const { + return grandparents_; + } + + uint64_t max_grandparent_overlap_bytes() const { + return max_grandparent_overlap_bytes_; + } + private: // mark (or clear) all files that are being compacted void MarkFilesBeingCompacted(bool mark_as_compacted); @@ -250,13 +273,9 @@ class Compaction { // A copy of inputs_, organized more closely in memory autovector input_levels_; - // State used to check for number of of overlapping grandparent files + // State used to check for number of overlapping grandparent files // (grandparent == "output_level_ + 1") std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files const double score_; // score that was used to pick this compaction. // Is this compaction creating a file in the bottom most level? @@ -273,6 +292,18 @@ class Compaction { // Does input compression match the output compression? 
bool InputCompressionMatchesOutput() const; + + // table properties of output files + TablePropertiesCollection output_table_properties_; + + // smallest user keys in compaction + Slice smallest_user_key_; + + // largest user keys in compaction + Slice largest_user_key_; + + // Reason for compaction + CompactionReason compaction_reason_; }; // Utility function diff --git a/external/rocksdb/db/compaction_iterator.cc b/external/rocksdb/db/compaction_iterator.cc index d242291ddd..aa160bee78 100644 --- a/external/rocksdb/db/compaction_iterator.cc +++ b/external/rocksdb/db/compaction_iterator.cc @@ -1,23 +1,26 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
#include "db/compaction_iterator.h" +#include "table/internal_iterator.h" namespace rocksdb { CompactionIterator::CompactionIterator( - Iterator* input, const Comparator* cmp, MergeHelper* merge_helper, + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, - Env* env, bool expect_valid_internal_key, Compaction* compaction, + SequenceNumber earliest_write_conflict_snapshot, Env* env, + bool expect_valid_internal_key, const Compaction* compaction, const CompactionFilter* compaction_filter, LogBuffer* log_buffer) : input_(input), cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), env_(env), expect_valid_internal_key_(expect_valid_internal_key), compaction_(compaction), @@ -41,6 +44,17 @@ CompactionIterator::CompactionIterator( earliest_snapshot_ = snapshots_->at(0); latest_snapshot_ = snapshots_->back(); } + if (compaction_filter_ != nullptr && compaction_filter_->IgnoreSnapshots()) { + ignore_snapshots_ = true; + } else { + ignore_snapshots_ = false; + } + input_->SetPinnedItersMgr(&pinned_iters_mgr_); +} + +CompactionIterator::~CompactionIterator() { + // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime + input_->SetPinnedItersMgr(nullptr); } void CompactionIterator::ResetRecordCounts() { @@ -75,6 +89,8 @@ void CompactionIterator::Next() { ikey_.user_key = current_key_.GetUserKey(); valid_ = true; } else { + // We consumed all pinned merge operands, release pinned iterators + pinned_iters_mgr_.ReleasePinnedIterators(); // MergeHelper moves the iterator to the first record after the merged // records, so even though we reached the end of the merge output, we do // not want to advance the iterator. @@ -89,6 +105,11 @@ void CompactionIterator::Next() { NextFromInput(); } + if (valid_) { + // Record that we've ouputted a record for the current key. 
+ has_outputted_key_ = true; + } + PrepareOutput(); } @@ -135,11 +156,14 @@ void CompactionIterator::NextFromInput() { key_ = current_key_.SetKey(key_, &ikey_); current_user_key_ = ikey_.user_key; has_current_user_key_ = true; + has_outputted_key_ = false; current_user_key_sequence_ = kMaxSequenceNumber; current_user_key_snapshot_ = 0; + // apply the compaction filter to the first occurrence of the user key if (compaction_filter_ != nullptr && ikey_.type == kTypeValue && - (visible_at_tip_ || ikey_.sequence > latest_snapshot_)) { + (visible_at_tip_ || ikey_.sequence > latest_snapshot_ || + ignore_snapshots_)) { // If the user has specified a compaction filter and the sequence // number is greater than any external snapshot, then invoke the // filter. If the return value of the compaction filter is true, @@ -169,6 +193,9 @@ void CompactionIterator::NextFromInput() { } else { // Update the current key to reflect the new sequence number/type without // copying the user key. + // TODO(rven): Compaction filter does not process keys in this path + // Need to have the compaction filter process multiple versions + // if we have versions on both sides of a snapshot current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); key_ = current_key_.GetKey(); ikey_.user_key = current_key_.GetUserKey(); @@ -186,31 +213,99 @@ void CompactionIterator::NextFromInput() { visible_at_tip_ ? visible_at_tip_ : findEarliestVisibleSnapshot( ikey_.sequence, &prev_snapshot); - if (ikey_.type == kTypeSingleDeletion) { + if (clear_and_output_next_key_) { + // In the previous iteration we encountered a single delete that we could + // not compact out. We will keep this Put, but can drop it's data. + // (See Optimization 3, below.) 
+ assert(ikey_.type == kTypeValue); + assert(current_user_key_snapshot_ == last_snapshot); + + value_.clear(); + valid_ = true; + clear_and_output_next_key_ = false; + } else if (ikey_.type == kTypeSingleDeletion) { + // We can compact out a SingleDelete if: + // 1) We encounter the corresponding PUT -OR- we know that this key + // doesn't appear past this output level + // =AND= + // 2) We've already returned a record in this snapshot -OR- + // there are no earlier earliest_write_conflict_snapshot. + // + // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to + // allow Transactions to do write-conflict checking (if we compacted away + // all keys, then we wouldn't know that a write happened in this + // snapshot). If there is no earlier snapshot, then we know that there + // are no active transactions that need to know about any writes. + // + // Optimization 3: + // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT + // true, then we must output a SingleDelete. In this case, we will decide + // to also output the PUT. While we are compacting less by outputting the + // PUT now, hopefully this will lead to better compaction in the future + // when Rule 2 is later true (Ie, We are hoping we can later compact out + // both the SingleDelete and the Put, while we couldn't if we only + // outputted the SingleDelete now). + // In this case, we can save space by removing the PUT's value as it will + // never be read. + // + // Deletes and Merges are not supported on the same key that has a + // SingleDelete as it is not possible to correctly do any partial + // compaction of such a combination of operations. The result of mixing + // those operations for a given key is documented as being undefined. So + // we can choose how to handle such a combinations of operations. We will + // try to compact out as much as we can in these cases. + + // The easiest way to process a SingleDelete during iteration is to peek + // ahead at the next key. 
ParsedInternalKey next_ikey; input_->Next(); - // Check whether the current key is valid, not corrupt and the same + // Check whether the next key exists, is not corrupt, and is the same key // as the single delete. if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { - // Mixing single deletes and merges is not supported. Consecutive - // single deletes are not valid. - if (next_ikey.type != kTypeValue) { - assert(false); - status_ = - Status::InvalidArgument("Put expected after single delete."); - break; - } - - // Check whether the current key belongs to the same snapshot as the - // single delete. + // Check whether the next key belongs to the same snapshot as the + // SingleDelete. if (prev_snapshot == 0 || next_ikey.sequence > prev_snapshot) { - // Found the matching value, we can drop the single delete and the - // value. - ++iter_stats_.num_record_drop_hidden; - ++iter_stats_.num_record_drop_obsolete; - input_->Next(); + if (next_ikey.type == kTypeSingleDeletion) { + // We encountered two SingleDeletes in a row. This could be due to + // unexpected user input. + // Skip the first SingleDelete and let the next iteration decide how + // to handle the second SingleDelete + + // First SingleDelete has been skipped since we already called + // input_->Next(). + ++iter_stats_.num_record_drop_obsolete; + } else if ((ikey_.sequence <= earliest_write_conflict_snapshot_) || + has_outputted_key_) { + // Found a matching value, we can drop the single delete and the + // value. It is safe to drop both records since we've already + // outputted a key in this snapshot, or there is no earlier + // snapshot (Rule 2 above). + + // Note: it doesn't matter whether the second key is a Put or if it + // is an unexpected Merge or Delete. We will compact it out + // either way. + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + // Already called input_->Next() once. 
Call it a second time to + // skip past the second key. + input_->Next(); + } else { + // Found a matching value, but we cannot drop both keys since + // there is an earlier snapshot and we need to leave behind a record + // to know that a write happened in this snapshot (Rule 2 above). + // Clear the value and output the SingleDelete. (The value will be + // outputted on the next iteration.) + ++iter_stats_.num_record_drop_hidden; + + // Setting valid_ to true will output the current SingleDelete + valid_ = true; + + // Set up the Put to be outputted in the next iteration. + // (Optimization 3). + clear_and_output_next_key_ = true; + } } else { // We hit the next snapshot without hitting a put, so the iterator // returns the single delete. @@ -225,11 +320,14 @@ void CompactionIterator::NextFromInput() { // iteration. If the next key is corrupt, we return before the // comparison, so the value of has_current_user_key does not matter. has_current_user_key_ = false; - if (compaction_ != nullptr && + if (compaction_ != nullptr && ikey_.sequence <= earliest_snapshot_ && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_)) { + // Key doesn't exist outside of this range. + // Can compact out this SingleDelete. ++iter_stats_.num_record_drop_obsolete; } else { + // Output SingleDelete valid_ = true; } } @@ -243,6 +341,10 @@ void CompactionIterator::NextFromInput() { // same key, then this kv is not visible in any snapshot. // Hidden by an newer entry for same user key // TODO: why not > ? + // + // Note: Dropping this key will not affect TransactionDB write-conflict + // checking since there has already been a record returned for this key + // in this snapshot. assert(last_sequence >= current_user_key_sequence_); ++iter_stats_.num_record_drop_hidden; // (A) input_->Next(); @@ -261,6 +363,9 @@ void CompactionIterator::NextFromInput() { // smaller sequence numbers will be dropped in the next // few iterations of this loop (by rule (A) above). 
// Therefore this deletion marker is obsolete and can be dropped. + // + // Note: Dropping this Delete will not affect TransactionDB + // write-conflict checking since it is earlier than any snapshot. ++iter_stats_.num_record_drop_obsolete; input_->Next(); } else if (ikey_.type == kTypeMerge) { @@ -271,6 +376,7 @@ void CompactionIterator::NextFromInput() { return; } + pinned_iters_mgr_.StartPinning(); // We know the merge type entry is not hidden, otherwise we would // have hit (A) // We encapsulate the merge related state machine in a different @@ -298,6 +404,7 @@ void CompactionIterator::NextFromInput() { // batch consumed by the merge operator should not shadow any keys // coming after the merges has_current_user_key_ = false; + pinned_iters_mgr_.ReleasePinnedIterators(); } } else { valid_ = true; @@ -309,9 +416,14 @@ void CompactionIterator::PrepareOutput() { // Zeroing out the sequence number leads to better compression. // If this is the bottommost level (no files in lower levels) // and the earliest snapshot is larger than this seqno + // and the userkey differs from the last userkey in compaction // then we can squash the seqno to zero. + + // This is safe for TransactionDB write-conflict checking since transactions + // only care about sequence number larger than any active snapshots. 
if (bottommost_level_ && valid_ && ikey_.sequence < earliest_snapshot_ && - ikey_.type != kTypeMerge) { + ikey_.type != kTypeMerge && + !cmp_->Equal(compaction_->GetLargestUserKey(), ikey_.user_key)) { assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); ikey_.sequence = 0; current_key_.UpdateInternalKey(0, ikey_.type); @@ -321,15 +433,15 @@ void CompactionIterator::PrepareOutput() { inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot) { assert(snapshots_->size()); - SequenceNumber prev __attribute__((unused)) = 0; + SequenceNumber prev __attribute__((__unused__)) = kMaxSequenceNumber; for (const auto cur : *snapshots_) { - assert(prev <= cur); + assert(prev == kMaxSequenceNumber || prev <= cur); if (cur >= in) { - *prev_snapshot = prev; + *prev_snapshot = prev == kMaxSequenceNumber ? 0 : prev; return cur; } prev = cur; - assert(prev); + assert(prev < kMaxSequenceNumber); } *prev_snapshot = prev; return kMaxSequenceNumber; diff --git a/external/rocksdb/db/compaction_iterator.h b/external/rocksdb/db/compaction_iterator.h index da242f6aa8..01f677b147 100644 --- a/external/rocksdb/db/compaction_iterator.h +++ b/external/rocksdb/db/compaction_iterator.h @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -13,6 +13,7 @@ #include "db/compaction.h" #include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" #include "rocksdb/compaction_filter.h" #include "util/log_buffer.h" @@ -37,14 +38,17 @@ struct CompactionIteratorStats { class CompactionIterator { public: - CompactionIterator(Iterator* input, const Comparator* cmp, + CompactionIterator(InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, Env* env, + std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, Env* env, bool expect_valid_internal_key, - Compaction* compaction = nullptr, + const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, LogBuffer* log_buffer = nullptr); + ~CompactionIterator(); + void ResetRecordCounts(); // Seek to the beginning of the compaction iterator output. @@ -84,13 +88,14 @@ class CompactionIterator { inline SequenceNumber findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot); - Iterator* input_; + InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; const std::vector* snapshots_; + const SequenceNumber earliest_write_conflict_snapshot_; Env* env_; bool expect_valid_internal_key_; - Compaction* compaction_; + const Compaction* compaction_; const CompactionFilter* compaction_filter_; LogBuffer* log_buffer_; bool bottommost_level_; @@ -98,6 +103,7 @@ class CompactionIterator { SequenceNumber visible_at_tip_; SequenceNumber earliest_snapshot_; SequenceNumber latest_snapshot_; + bool ignore_snapshots_; // State // @@ -124,7 +130,18 @@ class CompactionIterator { Slice current_user_key_; SequenceNumber current_user_key_sequence_; SequenceNumber current_user_key_snapshot_; + + // True if the iterator has already returned a record for the current key. + bool has_outputted_key_ = false; + + // truncated the value of the next key and output it without applying any + // compaction rules. 
This is used for outputting a put after a single delete. + bool clear_and_output_next_key_ = false; + MergeOutputIterator merge_out_iter_; + // PinnedIteratorsManager used to pin input_ Iterator blocks while reading + // merge operands and then releasing them after consuming them. + PinnedIteratorsManager pinned_iters_mgr_; std::string compaction_filter_value_; // "level_ptrs" holds indices that remember which file of an associated // level we were last checking during the last call to compaction-> diff --git a/external/rocksdb/db/compaction_iterator_test.cc b/external/rocksdb/db/compaction_iterator_test.cc index 1148c2ac7a..4cbccca55e 100644 --- a/external/rocksdb/db/compaction_iterator_test.cc +++ b/external/rocksdb/db/compaction_iterator_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -20,9 +20,9 @@ class CompactionIteratorTest : public testing::Test { nullptr, 0U, false, 0)); iter_.reset(new test::VectorIterator(ks, vs)); iter_->SeekToFirst(); - c_iter_.reset(new CompactionIterator(iter_.get(), cmp_, merge_helper_.get(), - last_sequence, &snapshots_, - Env::Default(), false)); + c_iter_.reset(new CompactionIterator( + iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, + kMaxSequenceNumber, Env::Default(), false)); } const Comparator* cmp_; diff --git a/external/rocksdb/db/compaction_job.cc b/external/rocksdb/db/compaction_job.cc index 4d6656d4e5..d86167d5fd 100644 --- a/external/rocksdb/db/compaction_job.cc +++ b/external/rocksdb/db/compaction_job.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,12 +16,13 @@ #include #include #include -#include -#include #include +#include +#include #include #include #include +#include #include "db/builder.h" #include "db/db_iter.h" @@ -51,6 +52,7 @@ #include "util/iostats_context_imp.h" #include "util/log_buffer.h" #include "util/logging.h" +#include "util/sst_file_manager_impl.h" #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" @@ -62,7 +64,7 @@ namespace rocksdb { // Maintains state for each sub-compaction struct CompactionJob::SubcompactionState { - Compaction* compaction; + const Compaction* compaction; std::unique_ptr c_iter; // The boundaries of the key-range this compaction is interested in. No two @@ -77,6 +79,7 @@ struct CompactionJob::SubcompactionState { struct Output { FileMetaData meta; bool finished; + std::shared_ptr table_properties; }; // State kept for output being generated @@ -102,6 +105,14 @@ struct CompactionJob::SubcompactionState { uint64_t num_output_records; CompactionJobStats compaction_job_stats; uint64_t approx_size; + // An index that used to speed up ShouldStopBefore(). + size_t grandparent_index = 0; + // The number of bytes overlapping between the current output and + // grandparent files used in ShouldStopBefore(). 
+ uint64_t overlapped_bytes = 0; + // A flag determine whether the key has been seen in ShouldStopBefore() + bool seen_key = false; + std::string compression_dict; SubcompactionState(Compaction* c, Slice* _start, Slice* _end, uint64_t size = 0) @@ -113,7 +124,11 @@ struct CompactionJob::SubcompactionState { total_bytes(0), num_input_records(0), num_output_records(0), - approx_size(size) { + approx_size(size), + grandparent_index(0), + overlapped_bytes(0), + seen_key(false), + compression_dict() { assert(compaction != nullptr); } @@ -132,6 +147,10 @@ struct CompactionJob::SubcompactionState { num_output_records = std::move(o.num_output_records); compaction_job_stats = std::move(o.compaction_job_stats); approx_size = std::move(o.approx_size); + grandparent_index = std::move(o.grandparent_index); + overlapped_bytes = std::move(o.overlapped_bytes); + seen_key = std::move(o.seen_key); + compression_dict = std::move(o.compression_dict); return *this; } @@ -139,6 +158,38 @@ struct CompactionJob::SubcompactionState { SubcompactionState(const SubcompactionState&) = delete; SubcompactionState& operator=(const SubcompactionState&) = delete; + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key) { + const InternalKeyComparator* icmp = + &compaction->column_family_data()->internal_comparator(); + const std::vector& grandparents = compaction->grandparents(); + + // Scan to find earliest grandparent file that contains key. 
+ while (grandparent_index < grandparents.size() && + icmp->Compare(internal_key, + grandparents[grandparent_index]->largest.Encode()) > + 0) { + if (seen_key) { + overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize(); + } + assert(grandparent_index + 1 >= grandparents.size() || + icmp->Compare( + grandparents[grandparent_index]->largest.Encode(), + grandparents[grandparent_index + 1]->smallest.Encode()) <= 0); + grandparent_index++; + } + seen_key = true; + + if (overlapped_bytes > compaction->max_grandparent_overlap_bytes()) { + // Too much overlap for current output; start new output + overlapped_bytes = 0; + return true; + } + + return false; + } }; // Maintains state for the entire compaction @@ -210,7 +261,9 @@ CompactionJob::CompactionJob( const EnvOptions& env_options, VersionSet* versions, std::atomic* shutting_down, LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, Status* db_bg_error, std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats) @@ -228,13 +281,18 @@ CompactionJob::CompactionJob( db_directory_(db_directory), output_directory_(output_directory), stats_(stats), + db_mutex_(db_mutex), + db_bg_error_(db_bg_error), existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), table_cache_(std::move(table_cache)), event_logger_(event_logger), paranoid_file_checks_(paranoid_file_checks), measure_io_stats_(measure_io_stats) { assert(log_buffer_ != nullptr); - ThreadStatusUtil::SetColumnFamily(compact_->compaction->column_family_data()); + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + 
cfd->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); ReportStartedCompaction(compaction); } @@ -246,8 +304,9 @@ CompactionJob::~CompactionJob() { void CompactionJob::ReportStartedCompaction( Compaction* compaction) { - ThreadStatusUtil::SetColumnFamily( - compact_->compaction->column_family_data()); + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + cfd->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperationProperty( ThreadStatus::COMPACTION_JOB_ID, @@ -331,11 +390,6 @@ struct RangeWithSize { : range(a, b), size(s) {} }; -bool SliceCompare(const Comparator* cmp, const Slice& a, const Slice& b) { - // Returns true if a < b - return cmp->Compare(ExtractUserKey(a), ExtractUserKey(b)) < 0; -} - // Generates a histogram representing potential divisions of key ranges from // the input. It adds the starting and/or ending keys of certain input files // to the working set and then finds the approximate size of data in between @@ -344,14 +398,13 @@ bool SliceCompare(const Comparator* cmp, const Slice& a, const Slice& b) { void CompactionJob::GenSubcompactionBoundaries() { auto* c = compact_->compaction; auto* cfd = c->column_family_data(); - std::set > bounds( - std::bind(&SliceCompare, cfd->user_comparator(), std::placeholders::_1, - std::placeholders::_2)); + const Comparator* cfd_comparator = cfd->user_comparator(); + std::vector bounds; int start_lvl = c->start_level(); int out_lvl = c->output_level(); // Add the starting and/or ending key of certain input files as a potential - // boundary (because we're inserting into a set, it avoids duplicates) + // boundary for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) { int lvl = c->level(lvl_idx); if (lvl >= start_lvl && lvl <= out_lvl) { @@ -359,34 +412,44 @@ void CompactionJob::GenSubcompactionBoundaries() { size_t num_files = flevel->num_files; if 
(num_files == 0) { - break; + continue; } if (lvl == 0) { // For level 0 add the starting and ending key of each file since the // files may have greatly differing key ranges (not range-partitioned) for (size_t i = 0; i < num_files; i++) { - bounds.emplace(flevel->files[i].smallest_key); - bounds.emplace(flevel->files[i].largest_key); + bounds.emplace_back(flevel->files[i].smallest_key); + bounds.emplace_back(flevel->files[i].largest_key); } } else { // For all other levels add the smallest/largest key in the level to // encompass the range covered by that level - bounds.emplace(flevel->files[0].smallest_key); - bounds.emplace(flevel->files[num_files - 1].largest_key); + bounds.emplace_back(flevel->files[0].smallest_key); + bounds.emplace_back(flevel->files[num_files - 1].largest_key); if (lvl == out_lvl) { // For the last level include the starting keys of all files since // the last level is the largest and probably has the widest key // range. Since it's range partitioned, the ending key of one file // and the starting key of the next are very close (or identical). 
for (size_t i = 1; i < num_files; i++) { - bounds.emplace(flevel->files[i].smallest_key); + bounds.emplace_back(flevel->files[i].smallest_key); } } } } } + std::sort(bounds.begin(), bounds.end(), + [cfd_comparator] (const Slice& a, const Slice& b) -> bool { + return cfd_comparator->Compare(ExtractUserKey(a), ExtractUserKey(b)) < 0; + }); + // Remove duplicated entries from bounds + bounds.erase(std::unique(bounds.begin(), bounds.end(), + [cfd_comparator] (const Slice& a, const Slice& b) -> bool { + return cfd_comparator->Compare(ExtractUserKey(a), ExtractUserKey(b)) == 0; + }), bounds.end()); + // Combine consecutive pairs of boundaries into ranges with an approximate // size of data covered by keys in that range uint64_t sum = 0; @@ -408,9 +471,9 @@ void CompactionJob::GenSubcompactionBoundaries() { // Group the ranges into subcompactions const double min_file_fill_percent = 4.0 / 5; - uint64_t max_output_files = std::ceil( - sum / min_file_fill_percent / - cfd->GetCurrentMutableCFOptions()->MaxFileSizeForLevel(out_lvl)); + uint64_t max_output_files = static_cast( + std::ceil(sum / min_file_fill_percent / + c->mutable_cf_options()->MaxFileSizeForLevel(out_lvl))); uint64_t subcompactions = std::min({static_cast(ranges.size()), static_cast(db_options_.max_subcompactions), @@ -423,12 +486,12 @@ void CompactionJob::GenSubcompactionBoundaries() { // sizes becomes >= the expected mean size of a subcompaction sum = 0; for (size_t i = 0; i < ranges.size() - 1; i++) { + sum += ranges[i].size; if (subcompactions == 1) { // If there's only one left to schedule then it goes to the end so no // need to put an end boundary - break; + continue; } - sum += ranges[i].size; if (sum >= mean) { boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit)); sizes_.emplace_back(sum); @@ -487,6 +550,16 @@ Status CompactionJob::Run() { } } + TablePropertiesCollection tp; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.outputs) { + auto fn = 
TableFileName(db_options_.db_paths, output.meta.fd.GetNumber(), + output.meta.fd.GetPathId()); + tp[fn] = output.table_properties; + } + } + compact_->compaction->SetOutputTableProperties(std::move(tp)); + // Finish up all book-keeping to unify the subcompaction results AggregateStatistics(); UpdateCompactionStats(); @@ -498,18 +571,17 @@ Status CompactionJob::Run() { return status; } -Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options, - InstrumentedMutex* db_mutex) { +Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_INSTALL); - db_mutex->AssertHeld(); + db_mutex_->AssertHeld(); Status status = compact_->status; ColumnFamilyData* cfd = compact_->compaction->column_family_data(); cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), compaction_stats_); if (status.ok()) { - status = InstallCompactionResults(mutable_cf_options, db_mutex); + status = InstallCompactionResults(mutable_cf_options); } VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); @@ -574,7 +646,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options, void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact != nullptr); - std::unique_ptr input( + std::unique_ptr input( versions_->MakeInputIterator(sub_compact->compaction)); AutoThreadOperationStageUpdater stage_updater( @@ -590,13 +662,38 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { if (measure_io_stats_) { prev_perf_level = GetPerfLevel(); SetPerfLevel(PerfLevel::kEnableTime); - prev_write_nanos = iostats_context.write_nanos; - prev_fsync_nanos = iostats_context.fsync_nanos; - prev_range_sync_nanos = iostats_context.range_sync_nanos; - prev_prepare_write_nanos = iostats_context.prepare_write_nanos; + prev_write_nanos = IOSTATS(write_nanos); + 
prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); } ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + const MutableCFOptions* mutable_cf_options = + sub_compact->compaction->mutable_cf_options(); + + // To build compression dictionary, we sample the first output file, assuming + // it'll reach the maximum length, and then use the dictionary for compressing + // subsequent output files. The dictionary may be less than max_dict_bytes if + // the first output file's length is less than the maximum. + const int kSampleLenShift = 6; // 2^6 = 64-byte samples + std::set sample_begin_offsets; + if (bottommost_level_ && + cfd->ioptions()->compression_opts.max_dict_bytes > 0) { + const size_t kMaxSamples = + cfd->ioptions()->compression_opts.max_dict_bytes >> kSampleLenShift; + const size_t kOutFileLen = mutable_cf_options->MaxFileSizeForLevel( + compact_->compaction->output_level()); + if (kOutFileLen != port::kMaxSizet) { + const size_t kOutFileNumSamples = kOutFileLen >> kSampleLenShift; + Random64 generator{versions_->NewFileNumber()}; + for (size_t i = 0; i < kMaxSamples; ++i) { + sample_begin_offsets.insert(generator.Uniform(kOutFileNumSamples) + << kSampleLenShift); + } + } + } + auto compaction_filter = cfd->ioptions()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; if (compaction_filter == nullptr) { @@ -607,7 +704,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { MergeHelper merge( env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, compaction_filter, db_options_.info_log.get(), - cfd->ioptions()->min_partial_merge_operands, + mutable_cf_options->min_partial_merge_operands, false /* internal key corruption is expected */, existing_snapshots_.empty() ? 
0 : existing_snapshots_.back(), compact_->compaction->level(), db_options_.statistics.get()); @@ -627,11 +724,18 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { Status status; sub_compact->c_iter.reset(new CompactionIterator( input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), - &existing_snapshots_, env_, false, sub_compact->compaction, - compaction_filter)); + &existing_snapshots_, earliest_write_conflict_snapshot_, env_, false, + sub_compact->compaction, compaction_filter)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); const auto& c_iter_stats = c_iter->iter_stats(); + auto sample_begin_offset_iter = sample_begin_offsets.cbegin(); + // data_begin_offset and compression_dict are only valid while generating + // dictionary from the first output file. + size_t data_begin_offset = 0; + std::string compression_dict; + compression_dict.reserve(cfd->ioptions()->compression_opts.max_dict_bytes); + // TODO(noetzli): check whether we could check !shutting_down_->... only // only occasionally (see diff D42687) while (status.ok() && !shutting_down_->load(std::memory_order_acquire) && @@ -646,7 +750,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { if (end != nullptr && cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) { break; - } else if (sub_compact->compaction->ShouldStopBefore(key) && + } else if (sub_compact->ShouldStopBefore(key) && sub_compact->builder != nullptr) { status = FinishCompactionOutputFile(input->status(), sub_compact); if (!status.ok()) { @@ -675,6 +779,55 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { key, c_iter->ikey().sequence); sub_compact->num_output_records++; + if (sub_compact->outputs.size() == 1) { // first output file + // Check if this key/value overlaps any sample intervals; if so, appends + // overlapping portions to the dictionary. 
+ for (const auto& data_elmt : {key, value}) { + size_t data_end_offset = data_begin_offset + data_elmt.size(); + while (sample_begin_offset_iter != sample_begin_offsets.cend() && + *sample_begin_offset_iter < data_end_offset) { + size_t sample_end_offset = + *sample_begin_offset_iter + (1 << kSampleLenShift); + // Invariant: Because we advance sample iterator while processing the + // data_elmt containing the sample's last byte, the current sample + // cannot end before the current data_elmt. + assert(data_begin_offset < sample_end_offset); + + size_t data_elmt_copy_offset, data_elmt_copy_len; + if (*sample_begin_offset_iter <= data_begin_offset) { + // The sample starts before data_elmt starts, so take bytes starting + // at the beginning of data_elmt. + data_elmt_copy_offset = 0; + } else { + // data_elmt starts before the sample starts, so take bytes starting + // at the below offset into data_elmt. + data_elmt_copy_offset = + *sample_begin_offset_iter - data_begin_offset; + } + if (sample_end_offset <= data_end_offset) { + // The sample ends before data_elmt ends, so take as many bytes as + // needed. + data_elmt_copy_len = + sample_end_offset - (data_begin_offset + data_elmt_copy_offset); + } else { + // data_elmt ends before the sample ends, so take all remaining + // bytes in data_elmt. + data_elmt_copy_len = + data_end_offset - (data_begin_offset + data_elmt_copy_offset); + } + compression_dict.append(&data_elmt.data()[data_elmt_copy_offset], + data_elmt_copy_len); + if (sample_end_offset > data_end_offset) { + // Didn't finish sample. Try to finish it with the next data_elmt. + break; + } + // Next sample may require bytes from same data_elmt. + sample_begin_offset_iter++; + } + data_begin_offset = data_end_offset; + } + } + // Close output file if it is big enough // TODO(aekmekji): determine if file should be closed earlier than this // during subcompactions (i.e. 
if output size, estimated by input size, is @@ -683,8 +836,12 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { if (sub_compact->builder->FileSize() >= sub_compact->compaction->max_output_file_size()) { status = FinishCompactionOutputFile(input->status(), sub_compact); + if (sub_compact->outputs.size() == 1) { + // Use dictionary from first output file for compression of subsequent + // files. + sub_compact->compression_dict = std::move(compression_dict); + } } - c_iter->Next(); } @@ -717,13 +874,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += - iostats_context.write_nanos - prev_write_nanos; + IOSTATS(write_nanos) - prev_write_nanos; sub_compact->compaction_job_stats.file_fsync_nanos += - iostats_context.fsync_nanos - prev_fsync_nanos; + IOSTATS(fsync_nanos) - prev_fsync_nanos; sub_compact->compaction_job_stats.file_range_sync_nanos += - iostats_context.range_sync_nanos - prev_range_sync_nanos; + IOSTATS(range_sync_nanos) - prev_range_sync_nanos; sub_compact->compaction_job_stats.file_prepare_write_nanos += - iostats_context.prepare_write_nanos - prev_prepare_write_nanos; + IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos; if (prev_perf_level != PerfLevel::kEnableTime) { SetPerfLevel(prev_perf_level); } @@ -797,10 +954,11 @@ Status CompactionJob::FinishCompactionOutputFile( } sub_compact->outfile.reset(); + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + TableProperties tp; if (s.ok() && current_entries > 0) { // Verify that the table is usable - ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - Iterator* iter = cfd->table_cache()->NewIterator( + InternalIterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), env_options_, cfd->internal_comparator(), meta->fd, nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), @@ 
-813,32 +971,51 @@ Status CompactionJob::FinishCompactionOutputFile( } delete iter; + + // Output to event logger and fire events. if (s.ok()) { - TableFileCreationInfo info(sub_compact->builder->GetTableProperties()); - info.db_name = dbname_; - info.cf_name = cfd->GetName(); - info.file_path = - TableFileName(cfd->ioptions()->db_paths, meta->fd.GetNumber(), - meta->fd.GetPathId()); - info.file_size = meta->fd.GetFileSize(); - info.job_id = job_id_; + tp = sub_compact->builder->GetTableProperties(); + sub_compact->current_output()->table_properties = + std::make_shared(tp); Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64 " keys, %" PRIu64 " bytes%s", cfd->GetName().c_str(), job_id_, output_number, current_entries, current_bytes, meta->marked_for_compaction ? " (need compaction)" : ""); - EventHelpers::LogAndNotifyTableFileCreation( - event_logger_, cfd->ioptions()->listeners, meta->fd, info); } } + std::string fname = TableFileName(db_options_.db_paths, meta->fd.GetNumber(), + meta->fd.GetPathId()); + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, + job_id_, meta->fd, tp, TableFileCreationReason::kCompaction, s); + + // Report new file to SstFileManagerImpl + auto sfm = + static_cast(db_options_.sst_file_manager.get()); + if (sfm && meta->fd.GetPathId() == 0) { + auto fn = TableFileName(cfd->ioptions()->db_paths, meta->fd.GetNumber(), + meta->fd.GetPathId()); + sfm->OnAddFile(fn); + if (sfm->IsMaxAllowedSpaceReached()) { + InstrumentedMutexLock l(db_mutex_); + if (db_bg_error_->ok()) { + s = Status::IOError("Max allowed space was reached"); + *db_bg_error_ = s; + TEST_SYNC_POINT( + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached"); + } + } + } + sub_compact->builder.reset(); return s; } Status CompactionJob::InstallCompactionResults( - const MutableCFOptions& mutable_cf_options, InstrumentedMutex* db_mutex) 
{ - db_mutex->AssertHeld(); + const MutableCFOptions& mutable_cf_options) { + db_mutex_->AssertHeld(); auto* compaction = compact_->compaction; // paranoia: verify that the files that we started with @@ -873,7 +1050,7 @@ Status CompactionJob::InstallCompactionResults( } return versions_->LogAndApply(compaction->column_family_data(), mutable_cf_options, compaction->edit(), - db_mutex, db_directory_); + db_mutex_, db_directory_); } void CompactionJob::RecordCompactionIOStats() { @@ -893,11 +1070,18 @@ Status CompactionJob::OpenCompactionOutputFile( assert(sub_compact->builder == nullptr); // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); - // Make the output file - unique_ptr writable_file; std::string fname = TableFileName(db_options_.db_paths, file_number, sub_compact->compaction->output_path_id()); - Status s = env_->NewWritableFile(fname, &writable_file, env_options_); + // Fire events. + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); +#ifndef ROCKSDB_LITE + EventHelpers::NotifyTableFileCreationStarted( + cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, + TableFileCreationReason::kCompaction); +#endif // !ROCKSDB_LITE + // Make the output file + unique_ptr writable_file; + Status s = NewWritableFile(env_, fname, &writable_file, env_options_); if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64 @@ -905,8 +1089,13 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->column_family_data()->GetName().c_str(), job_id_, file_number, s.ToString().c_str()); LogFlush(db_options_.info_log); + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), + fname, job_id_, FileDescriptor(), TableProperties(), + TableFileCreationReason::kCompaction, s); return s; } + SubcompactionState::Output out; 
out.meta.fd = FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0); @@ -919,7 +1108,6 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->outfile.reset( new WritableFileWriter(std::move(writable_file), env_options_)); - ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); // If the Column family flag is to only optimize filters for hits, // we can skip creating filters if this is the bottommost_level where // data is going to be found @@ -927,9 +1115,10 @@ Status CompactionJob::OpenCompactionOutputFile( cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; sub_compact->builder.reset(NewTableBuilder( *cfd->ioptions(), cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), sub_compact->outfile.get(), - sub_compact->compaction->output_compression(), - cfd->ioptions()->compression_opts, skip_filters)); + cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), + sub_compact->outfile.get(), sub_compact->compaction->output_compression(), + cfd->ioptions()->compression_opts, &sub_compact->compression_dict, + skip_filters)); LogFlush(db_options_.info_log); return s; } diff --git a/external/rocksdb/db/compaction_job.h b/external/rocksdb/db/compaction_job.h index 1054fecc97..b2a592ea9d 100644 --- a/external/rocksdb/db/compaction_job.h +++ b/external/rocksdb/db/compaction_job.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -35,9 +35,9 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/event_logger.h" -#include "util/scoped_arena_iterator.h" #include "util/stop_watch.h" #include "util/thread_local.h" @@ -56,8 +56,10 @@ class CompactionJob { const EnvOptions& env_options, VersionSet* versions, std::atomic* shutting_down, LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, - Statistics* stats, + Statistics* stats, InstrumentedMutex* db_mutex, + Status* db_bg_error, std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, @@ -76,8 +78,7 @@ class CompactionJob { Status Run(); // REQUIRED: mutex held - Status Install(const MutableCFOptions& mutable_cf_options, - InstrumentedMutex* db_mutex); + Status Install(const MutableCFOptions& mutable_cf_options); private: struct SubcompactionState; @@ -94,8 +95,7 @@ class CompactionJob { Status FinishCompactionOutputFile(const Status& input_status, SubcompactionState* sub_compact); - Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options, - InstrumentedMutex* db_mutex); + Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); void RecordCompactionIOStats(); Status OpenCompactionOutputFile(SubcompactionState* sub_compact); void CleanupCompaction(); @@ -122,6 +122,7 @@ class CompactionJob { const std::string& dbname_; const DBOptions& db_options_; const EnvOptions& env_options_; + Env* env_; VersionSet* versions_; std::atomic* shutting_down_; @@ -129,11 +130,19 @@ class CompactionJob { Directory* db_directory_; Directory* output_directory_; Statistics* stats_; + InstrumentedMutex* db_mutex_; + Status* db_bg_error_; // If there were two snapshots with seq numbers s1 and // s2 and s1 < s2, and if we 
find two instances of a key k1 then lies // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; + + // This is the earliest snapshot that could be used for write-conflict + // checking by a transaction. For any user-key newer than this snapshot, we + // should make sure not to remove evidence that a write occurred. + SequenceNumber earliest_write_conflict_snapshot_; + std::shared_ptr table_cache_; EventLogger* event_logger_; diff --git a/external/rocksdb/db/compaction_job_stats_test.cc b/external/rocksdb/db/compaction_job_stats_test.cc index 8641c8a843..a05b0ba64a 100644 --- a/external/rocksdb/db/compaction_job_stats_test.cc +++ b/external/rocksdb/db/compaction_job_stats_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -27,6 +27,7 @@ #include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "memtable/hash_linklist_rep.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -47,14 +48,13 @@ #include "table/block_based_table_factory.h" #include "table/mock_table.h" #include "table/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/hash.h" -#include "util/hash_linklist_rep.h" #include "util/logging.h" #include "util/mock_env.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" -#include "util/scoped_arena_iterator.h" #include "util/statistics.h" #include "util/string_util.h" #include "util/sync_point.h" @@ -64,7 +64,7 @@ #include "util/xfunc.h" #include "utilities/merge_operators.h" -#if !defined(IOS_CROSS_COMPILE) && (!defined(NDEBUG) || !defined(OS_WIN)) +#if !defined(IOS_CROSS_COMPILE) #ifndef ROCKSDB_LITE namespace rocksdb { @@ -551,8 +551,9 @@ uint64_t EstimatedFileSize( const size_t kFooterSize = 512; uint64_t data_size = + static_cast( num_records * (key_size + value_size * compression_ratio + - kPerKeyOverhead); + kPerKeyOverhead)); return data_size + kFooterSize + num_records * bloom_bits_per_key / 8 // filter block @@ -623,7 +624,10 @@ CompressionType GetAnyCompression() { return kBZip2Compression; } else if (LZ4_Supported()) { return kLZ4Compression; + } else if (XPRESS_Supported()) { + return kXpressCompression; } + return kNoCompression; } @@ -659,7 +663,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { options.max_subcompactions = max_subcompactions_; options.bytes_per_sync = 512 * 1024; - options.compaction_measure_io_stats = true; + options.report_bg_io_stats = true; for (int test = 0; test < 2; ++test) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); diff --git a/external/rocksdb/db/compaction_job_test.cc b/external/rocksdb/db/compaction_job_test.cc index 
b1a8909ef3..e218bd32a0 100644 --- a/external/rocksdb/db/compaction_job_test.cc +++ b/external/rocksdb/db/compaction_job_test.cc @@ -1,20 +1,22 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE + #include #include #include #include -#include "db/compaction_job.h" #include "db/column_family.h" +#include "db/compaction_job.h" #include "db/version_set.h" -#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/options.h" +#include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" @@ -68,9 +70,9 @@ class CompactionJobTest : public testing::Test { dbname_(test::TmpDir() + "/compaction_job_test"), mutable_cf_options_(Options(), ImmutableCFOptions(Options())), table_cache_(NewLRUCache(50000, 16)), - write_buffer_(db_options_.db_write_buffer_size), + write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_, + table_cache_.get(), &write_buffer_manager_, &write_controller_)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { @@ -160,9 +162,12 @@ class CompactionJobTest : public testing::Test { auto key = ToString(i * kMatchingKeys + k); auto value = ToString(i * kKeysPerFile + k); InternalKey internal_key(key, ++sequence_number, kTypeValue); + // This is how the key will look like once it's written in bottommost // file - InternalKey bottommost_internal_key(key, 0, kTypeValue); + InternalKey bottommost_internal_key( + key, (key == "9999") ? 
sequence_number : 0, kTypeValue); + if (corrupt_id(k)) { test::CorruptKeyType(&internal_key); test::CorruptKeyType(&bottommost_internal_key); @@ -196,7 +201,7 @@ class CompactionJobTest : public testing::Test { unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options_)); { - log::Writer log(std::move(file_writer)); + log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); @@ -215,9 +220,11 @@ class CompactionJobTest : public testing::Test { cfd_ = versions_->GetColumnFamilySet()->GetDefault(); } - void RunCompaction(const std::vector>& input_files, - const stl_wrappers::KVMap& expected_results, - const std::vector& snapshots = {}) { + void RunCompaction( + const std::vector>& input_files, + const stl_wrappers::KVMap& expected_results, + const std::vector& snapshots = {}, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); size_t num_input_files = 0; @@ -241,11 +248,11 @@ class CompactionJobTest : public testing::Test { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); mutex_.Lock(); EventLogger event_logger(db_options_.info_log.get()); - CompactionJob compaction_job(0, &compaction, db_options_, env_options_, - versions_.get(), &shutting_down_, &log_buffer, - nullptr, nullptr, nullptr, snapshots, - table_cache_, &event_logger, false, false, - dbname_, &compaction_job_stats_); + CompactionJob compaction_job( + 0, &compaction, db_options_, env_options_, versions_.get(), + &shutting_down_, &log_buffer, nullptr, nullptr, nullptr, &mutex_, + &bg_error_, snapshots, earliest_write_conflict_snapshot, table_cache_, + &event_logger, false, false, dbname_, &compaction_job_stats_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); @@ -255,8 +262,7 @@ class CompactionJobTest : public testing::Test { s = compaction_job.Run(); ASSERT_OK(s); mutex_.Lock(); - 
ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions(), - &mutex_)); + ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); mutex_.Unlock(); if (expected_results.size() == 0) { @@ -279,7 +285,7 @@ class CompactionJobTest : public testing::Test { WriteController write_controller_; DBOptions db_options_; ColumnFamilyOptions cf_options_; - WriteBuffer write_buffer_; + WriteBufferManager write_buffer_manager_; std::unique_ptr versions_; InstrumentedMutex mutex_; std::atomic shutting_down_; @@ -288,6 +294,7 @@ class CompactionJobTest : public testing::Test { ColumnFamilyData* cfd_; std::unique_ptr compaction_filter_; std::shared_ptr merge_op_; + Status bg_error_; }; TEST_F(CompactionJobTest, Simple) { @@ -344,7 +351,7 @@ TEST_F(CompactionJobTest, SimpleOverwrite) { auto expected_results = mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"}, - {KeyStr("b", 0U, kTypeValue), "val3"}}); + {KeyStr("b", 4U, kTypeValue), "val3"}}); SetLastSequence(4U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -397,7 +404,7 @@ TEST_F(CompactionJobTest, SimpleMerge) { auto expected_results = mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, - {KeyStr("b", 0U, kTypeValue), "1,2"}}); + {KeyStr("b", 2U, kTypeValue), "1,2"}}); SetLastSequence(5U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -557,34 +564,144 @@ TEST_F(CompactionJobTest, SimpleSingleDelete) { TEST_F(CompactionJobTest, SingleDeleteSnapshots) { NewDB(); - auto file1 = mock::MakeMockFile({{KeyStr("A", 12U, kTypeSingleDeletion), ""}, - {KeyStr("a", 12U, kTypeSingleDeletion), ""}, - {KeyStr("b", 21U, kTypeSingleDeletion), ""}, - {KeyStr("c", 22U, kTypeSingleDeletion), ""}, - {KeyStr("d", 9U, kTypeSingleDeletion), ""}}); + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + 
{KeyStr("d", 9U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 9U, kTypeSingleDeletion), ""}, + {KeyStr("k", 12U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("l", 3U, kTypeSingleDeletion), ""}, + {KeyStr("l", 2U, kTypeSingleDeletion), ""}, + }); AddMockFile(file1); - auto file2 = mock::MakeMockFile({{KeyStr("0", 2U, kTypeSingleDeletion), ""}, - {KeyStr("a", 11U, kTypeValue), "val1"}, - {KeyStr("b", 11U, kTypeValue), "val2"}, - {KeyStr("c", 21U, kTypeValue), "val3"}, - {KeyStr("d", 8U, kTypeValue), "val4"}, - {KeyStr("e", 2U, kTypeSingleDeletion), ""}}); + auto file2 = mock::MakeMockFile({ + {KeyStr("0", 2U, kTypeSingleDeletion), ""}, + {KeyStr("a", 11U, kTypeValue), "val1"}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 21U, kTypeValue), "val3"}, + {KeyStr("d", 8U, kTypeValue), "val4"}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("h", 2U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); AddMockFile(file2); - auto file3 = mock::MakeMockFile({{KeyStr("A", 1U, kTypeValue), "val"}, - {KeyStr("e", 1U, kTypeValue), "val"}}); + auto file3 = mock::MakeMockFile({ + {KeyStr("A", 1U, kTypeValue), "val"}, + {KeyStr("e", 1U, kTypeValue), "val"}, + }); AddMockFile(file3, 2); - auto expected_results = - mock::MakeMockFile({{KeyStr("A", 12U, kTypeSingleDeletion), ""}, - {KeyStr("b", 21U, kTypeSingleDeletion), ""}, - {KeyStr("b", 11U, kTypeValue), "val2"}, - {KeyStr("e", 2U, kTypeSingleDeletion), ""}}); + auto expected_results = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 11U, kTypeValue), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + 
{KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("c", 21U, kTypeValue), ""}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); SetLastSequence(22U); auto files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected_results, {10U, 20U}); + RunCompaction({files}, expected_results, {10U, 20U}, 10U); +} + +TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) { + NewDB(); + + // Test multiple snapshots where the earliest snapshot is not a + // write-conflic-snapshot. + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), "val"}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), "val"}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), "val"}, + {KeyStr("G", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 23U, kTypeValue), "val2"}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("H", 24U, kTypeSingleDeletion), ""}, + {KeyStr("H", 23U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), "val2"}, + {KeyStr("I", 33U, kTypeSingleDeletion), ""}, + {KeyStr("I", 32U, kTypeValue), "val3"}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("J", 34U, kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 14U, kTypeSingleDeletion), ""}, + 
{KeyStr("A", 13U, kTypeValue), "val2"}, + {KeyStr("C", 14U, kTypeSingleDeletion), ""}, + {KeyStr("C", 13U, kTypeValue), "val"}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val3"}, + {KeyStr("H", 14U, kTypeSingleDeletion), ""}, + {KeyStr("H", 13U, kTypeValue), "val2"}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val5"}, + {KeyStr("J", 15U, kTypeValue), "val3"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), ""}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), ""}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), ""}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), ""}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("J", 34U, kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + {KeyStr("J", 15U, kTypeValue), "val3"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + + SetLastSequence(24U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {10U, 20U, 30U}, 20U); } TEST_F(CompactionJobTest, SingleDeleteZeroSeq) { @@ -602,7 +719,7 @@ TEST_F(CompactionJobTest, SingleDeleteZeroSeq) { AddMockFile(file2); auto expected_results = mock::MakeMockFile({ - {KeyStr("dummy", 0U, kTypeValue), "val2"}, + {KeyStr("dummy", 5U, kTypeValue), 
"val2"}, }); SetLastSequence(22U); @@ -614,9 +731,22 @@ TEST_F(CompactionJobTest, MultiSingleDelete) { // Tests three scenarios involving multiple single delete/put pairs: // // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel - // B: Put SDel Put SDel -> (Removed) + // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot // C: SDel Put SDel Snapshot Put -> Snapshot Put - // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot + // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel + // E: Put SDel Snapshot Put SDel -> Snapshot SDel + // F: Put SDel Put Sdel Snapshot -> removed + // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel + // H: (Put) Put SDel Put Sdel Snapshot -> Removed + // I: (Put) Snapshot Put SDel Put SDel -> SDel + // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put + // -> Snapshot Put + // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel + // -> Snapshot Put Snapshot SDel + // L: SDel Put Del Put SDel Snapshot Del Put Del SDel Put SDel + // -> Snapshot SDel + // M: (Put) SDel Put Del Put SDel Snapshot Put Del SDel Put SDel Del + // -> SDel Snapshot Del NewDB(); auto file1 = mock::MakeMockFile({ @@ -628,6 +758,34 @@ TEST_F(CompactionJobTest, MultiSingleDelete) { {KeyStr("C", 14U, kTypeValue), "val3"}, {KeyStr("D", 12U, kTypeSingleDeletion), ""}, {KeyStr("D", 11U, kTypeValue), "val4"}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val"}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val"}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + {KeyStr("J", 13U, kTypeSingleDeletion), ""}, + {KeyStr("J", 12U, kTypeValue), "val"}, + {KeyStr("J", 11U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), "val1"}, + {KeyStr("K", 14U, kTypeSingleDeletion), ""}, + {KeyStr("K", 13U, kTypeSingleDeletion), ""}, + 
{KeyStr("K", 12U, kTypeValue), "val2"}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), "val"}, + {KeyStr("L", 14U, kTypeSingleDeletion), ""}, + {KeyStr("L", 13U, kTypeDeletion), ""}, + {KeyStr("L", 12U, kTypeValue), "val"}, + {KeyStr("L", 11U, kTypeDeletion), ""}, + {KeyStr("M", 16U, kTypeDeletion), ""}, + {KeyStr("M", 15U, kTypeSingleDeletion), ""}, + {KeyStr("M", 14U, kTypeValue), "val"}, + {KeyStr("M", 13U, kTypeSingleDeletion), ""}, + {KeyStr("M", 12U, kTypeDeletion), ""}, + {KeyStr("M", 11U, kTypeValue), "val"}, }); AddMockFile(file1); @@ -639,24 +797,87 @@ TEST_F(CompactionJobTest, MultiSingleDelete) { {KeyStr("C", 9U, kTypeValue), "val6"}, {KeyStr("C", 8U, kTypeSingleDeletion), ""}, {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), "val"}, + {KeyStr("E", 5U, kTypeSingleDeletion), ""}, + {KeyStr("E", 4U, kTypeValue), "val"}, + {KeyStr("F", 6U, kTypeSingleDeletion), ""}, + {KeyStr("F", 5U, kTypeValue), "val"}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("H", 6U, kTypeSingleDeletion), ""}, + {KeyStr("H", 5U, kTypeValue), "val"}, + {KeyStr("H", 4U, kTypeSingleDeletion), ""}, + {KeyStr("H", 3U, kTypeValue), "val"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val"}, + {KeyStr("J", 6U, kTypeSingleDeletion), ""}, + {KeyStr("J", 5U, kTypeSingleDeletion), ""}, + {KeyStr("J", 4U, kTypeValue), "val"}, + {KeyStr("J", 3U, kTypeSingleDeletion), ""}, + {KeyStr("J", 2U, kTypeValue), "val"}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("K", 7U, kTypeValue), "val4"}, + {KeyStr("K", 6U, kTypeSingleDeletion), ""}, + {KeyStr("K", 5U, kTypeValue), "val5"}, + {KeyStr("K", 2U, kTypeSingleDeletion), ""}, + {KeyStr("K", 1U, kTypeSingleDeletion), ""}, + {KeyStr("L", 5U, kTypeSingleDeletion), 
""}, + {KeyStr("L", 4U, kTypeValue), "val"}, + {KeyStr("L", 3U, kTypeDeletion), ""}, + {KeyStr("L", 2U, kTypeValue), "val"}, + {KeyStr("L", 1U, kTypeSingleDeletion), ""}, + {KeyStr("M", 10U, kTypeSingleDeletion), ""}, + {KeyStr("M", 7U, kTypeValue), "val"}, + {KeyStr("M", 5U, kTypeDeletion), ""}, + {KeyStr("M", 4U, kTypeValue), "val"}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}, }); AddMockFile(file2); auto file3 = mock::MakeMockFile({ - {KeyStr("D", 11U, kTypeValue), "val"}, + {KeyStr("D", 1U, kTypeValue), "val"}, + {KeyStr("H", 1U, kTypeValue), "val"}, + {KeyStr("I", 2U, kTypeValue), "val"}, }); AddMockFile(file3, 2); - auto expected_results = mock::MakeMockFile({ - {KeyStr("A", 12U, kTypeSingleDeletion), ""}, - {KeyStr("A", 10U, kTypeValue), "val"}, - {KeyStr("C", 14U, kTypeValue), "val3"}, - {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + auto file4 = mock::MakeMockFile({ + {KeyStr("M", 1U, kTypeValue), "val"}, }); + AddMockFile(file4, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), ""}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), ""}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), ""}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), ""}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), ""}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), ""}, + 
{KeyStr("M", 16U, kTypeDeletion), ""}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}}); SetLastSequence(22U); auto files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected_results, {10U}); + RunCompaction({files}, expected_results, {10U}, 10U); } // This test documents the behavior where a corrupt key follows a deletion or a @@ -682,7 +903,7 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"}, {test::KeyStr("a", 0U, kTypeValue, true), "val"}, {test::KeyStr("b", 0U, kTypeValue, true), "val"}, - {test::KeyStr("c", 0U, kTypeValue), "val2"}}); + {test::KeyStr("c", 1U, kTypeValue), "val2"}}); SetLastSequence(6U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -695,3 +916,14 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } + +#else +#include + +int main(int argc, char** argv) { + fprintf(stderr, + "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/external/rocksdb/db/compaction_picker.cc b/external/rocksdb/db/compaction_picker.cc index 27935085db..3d3e3e6682 100644 --- a/external/rocksdb/db/compaction_picker.cc +++ b/external/rocksdb/db/compaction_picker.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -70,7 +70,8 @@ struct UserKeyComparator { }; typedef std::priority_queue, - UserKeyComparator> SmallestKeyHeap; + UserKeyComparator> + SmallestKeyHeap; // This function creates the heap that is used to find if the files are // overlapping during universal compaction when the allow_trivial_move @@ -109,13 +110,22 @@ SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) { // matter what the values of the other two parameters are. // Otherwise, the compression type is determined based on options and level. CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, + const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, int level, int base_level, const bool enable_compression) { if (!enable_compression) { // disable compression return kNoCompression; } - // If the use has specified a different compression level for each level, + + // If bottommost_compression is set and we are compacting to the + // bottommost level then we should use it. + if (ioptions.bottommost_compression != kDisableCompressionOption && + level > base_level && level >= (vstorage->num_non_empty_levels() - 1)) { + return ioptions.bottommost_compression; + } + // If the user has specified a different compression level for each level, // then pick the compression for that level. if (!ioptions.compression_per_level.empty()) { assert(level == 0 || level >= base_level); @@ -129,7 +139,7 @@ CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, // specified compression levels, use the last value. return ioptions.compression_per_level[std::max(0, std::min(idx, n))]; } else { - return ioptions.compression; + return mutable_cf_options.compression; } } @@ -141,7 +151,8 @@ CompactionPicker::~CompactionPicker() {} // Delete this compaction from the list of running compactions. 
void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { - if (c->start_level() == 0) { + if (c->start_level() == 0 || + ioptions_.compaction_style == kCompactionStyleUniversal) { level0_compactions_in_progress_.erase(c); } if (!status.ok()) { @@ -189,10 +200,9 @@ void CompactionPicker::GetRange(const CompactionInputFiles& inputs1, InternalKey smallest1, smallest2, largest1, largest2; GetRange(inputs1, &smallest1, &largest1); GetRange(inputs2, &smallest2, &largest2); - *smallest = icmp_->Compare(smallest1, smallest2) < 0 ? - smallest1 : smallest2; - *largest = icmp_->Compare(largest1, largest2) < 0 ? - largest2 : largest1; + *smallest = + icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2; + *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1; } } @@ -243,7 +253,7 @@ bool CompactionPicker::ExpandWhileOverlapping(const std::string& cf_name, // Returns true if any one of specified files are being compacted bool CompactionPicker::FilesInCompaction( const std::vector& files) { - for (unsigned int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { if (files[i]->being_compacted) { return true; } @@ -255,22 +265,35 @@ Compaction* CompactionPicker::FormCompaction( const CompactionOptions& compact_options, const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, - uint32_t output_path_id) const { + uint32_t output_path_id) { uint64_t max_grandparent_overlap_bytes = - output_level + 1 < vstorage->num_levels() ? - mutable_cf_options.MaxGrandParentOverlapBytes(output_level + 1) : - std::numeric_limits::max(); + output_level + 1 < vstorage->num_levels() + ? 
mutable_cf_options.MaxGrandParentOverlapBytes(output_level + 1) + : std::numeric_limits::max(); assert(input_files.size()); - return new Compaction( + + // TODO(rven ): we might be able to run concurrent level 0 compaction + // if the key ranges of the two compactions do not overlap, but for now + // we do not allow it. + if ((input_files[0].level == 0) && !level0_compactions_in_progress_.empty()) { + return nullptr; + } + auto c = new Compaction( vstorage, mutable_cf_options, input_files, output_level, compact_options.output_file_size_limit, max_grandparent_overlap_bytes, output_path_id, compact_options.compression, /* grandparents */ {}, true); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + if ((c != nullptr) && (input_files[0].level == 0)) { + level0_compactions_in_progress_.insert(c); + } + return c; } Status CompactionPicker::GetCompactionInputsFromFileNumbers( std::vector* input_files, - std::unordered_set* input_set, - const VersionStorageInfo* vstorage, + std::unordered_set* input_set, const VersionStorageInfo* vstorage, const CompactionOptions& compact_options) const { if (input_set->size() == 0U) { return Status::InvalidArgument( @@ -308,8 +331,8 @@ Status CompactionPicker::GetCompactionInputsFromFileNumbers( return Status::InvalidArgument(message); } - for (int level = first_non_empty_level; - level <= last_non_empty_level; ++level) { + for (int level = first_non_empty_level; level <= last_non_empty_level; + ++level) { matched_input_files[level].level = level; input_files->emplace_back(std::move(matched_input_files[level])); } @@ -317,8 +340,6 @@ Status CompactionPicker::GetCompactionInputsFromFileNumbers( return Status::OK(); } - - // Returns true if any one of the parent files are being compacted bool CompactionPicker::RangeInCompaction(VersionStorageInfo* vstorage, const InternalKey* smallest, @@ -369,6 +390,9 @@ bool CompactionPicker::SetupOtherInputs( 
vstorage->GetOverlappingInputs(output_level, &smallest, &largest, &output_level_inputs->files, *parent_index, parent_index); + if (!output_level_inputs->empty()) { + ExpandWhileOverlapping(cf_name, vstorage, output_level_inputs); + } if (FilesInCompaction(output_level_inputs->files)) { return false; @@ -441,7 +465,7 @@ Compaction* CompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) { + InternalKey** compaction_end, bool* manual_conflict) { // CompactionPickerFIFO has its own implementation of compact range assert(ioptions_.compaction_style != kCompactionStyleFIFO); @@ -467,6 +491,12 @@ Compaction* CompactionPicker::CompactRange( return nullptr; } + if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) { + *manual_conflict = true; + // Only one level 0 compaction allowed + return nullptr; + } + std::vector inputs(vstorage->num_levels() - start_level); for (int level = start_level; level < vstorage->num_levels(); level++) { @@ -475,13 +505,22 @@ Compaction* CompactionPicker::CompactRange( for (FileMetaData* f : vstorage->LevelFiles(level)) { files.push_back(f); } + if (FilesInCompaction(files)) { + *manual_conflict = true; + return nullptr; + } } - return new Compaction( + Compaction* c = new Compaction( vstorage, mutable_cf_options, std::move(inputs), output_level, mutable_cf_options.MaxFileSizeForLevel(output_level), /* max_grandparent_overlap_bytes */ LLONG_MAX, output_path_id, - GetCompressionType(ioptions_, output_level, 1), + GetCompressionType(ioptions_, vstorage, mutable_cf_options, + output_level, 1), /* grandparents */ {}, /* is manual */ true); + if (start_level == 0) { + level0_compactions_in_progress_.insert(c); + } + return c; } CompactionInputFiles inputs; @@ -500,13 +539,20 @@ Compaction* 
CompactionPicker::CompactRange( return nullptr; } + if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) { + // Only one level 0 compaction allowed + TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict"); + *manual_conflict = true; + return nullptr; + } + // Avoid compacting too much in one shot in case the range is large. // But we cannot do this for level-0 since level-0 files can overlap // and we must not pick one file and drop another older file if the // two files overlap. if (input_level > 0) { const uint64_t limit = mutable_cf_options.MaxFileSizeForLevel(input_level) * - mutable_cf_options.source_compaction_factor; + mutable_cf_options.source_compaction_factor; uint64_t total = 0; for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->compensated_file_size; @@ -522,9 +568,10 @@ Compaction* CompactionPicker::CompactRange( assert(output_path_id < static_cast(ioptions_.db_paths.size())); if (ExpandWhileOverlapping(cf_name, vstorage, &inputs) == false) { - // manual compaction is currently single-threaded, so it should never + // manual compaction is now multi-threaded, so it can // happen that ExpandWhileOverlapping fails - assert(false); + // we handle it higher in RunManualCompaction + *manual_conflict = true; return nullptr; } @@ -543,9 +590,10 @@ Compaction* CompactionPicker::CompactRange( int parent_index = -1; if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs, &output_level_inputs, &parent_index, -1)) { - // manual compaction is currently single-threaded, so it should never + // manual compaction is now multi-threaded, so it can // happen that SetupOtherInputs fails - assert(false); + // we handle it higher in RunManualCompaction + *manual_conflict = true; return nullptr; } } @@ -554,6 +602,12 @@ Compaction* CompactionPicker::CompactRange( if (!output_level_inputs.empty()) { compaction_inputs.push_back(output_level_inputs); } + for (size_t i = 0; i < compaction_inputs.size(); i++) { + if 
(FilesInCompaction(compaction_inputs[i].files)) { + *manual_conflict = true; + return nullptr; + } + } std::vector grandparents; GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents); @@ -562,19 +616,29 @@ Compaction* CompactionPicker::CompactRange( mutable_cf_options.MaxFileSizeForLevel(output_level), mutable_cf_options.MaxGrandParentOverlapBytes(input_level), output_path_id, - GetCompressionType(ioptions_, output_level, vstorage->base_level()), + GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, + vstorage->base_level()), std::move(grandparents), /* is manual compaction */ true); TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction); + if (input_level == 0) { + level0_compactions_in_progress_.insert(compaction); + } + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage->ComputeCompactionScore(mutable_cf_options); + return compaction; } #ifndef ROCKSDB_LITE namespace { // Test whether two files have overlapping key-ranges. 
-bool HaveOverlappingKeyRanges( - const Comparator* c, - const SstFileMetaData& a, const SstFileMetaData& b) { +bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a, + const SstFileMetaData& b) { if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { if (c->Compare(a.smallestkey, b.largestkey) <= 0) { // b.smallestkey <= a.smallestkey <= b.largestkey @@ -598,9 +662,8 @@ bool HaveOverlappingKeyRanges( } // namespace Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( - std::unordered_set* input_files, - const ColumnFamilyMetaData& cf_meta, - const int output_level) const { + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const { auto& levels = cf_meta.levels; auto comparator = icmp_->user_comparator(); @@ -653,18 +716,17 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( // has overlapping key-range with other non-compaction input // files in the same level. while (first_included > 0) { - if (comparator->Compare( - current_files[first_included - 1].largestkey, - current_files[first_included].smallestkey) < 0) { + if (comparator->Compare(current_files[first_included - 1].largestkey, + current_files[first_included].smallestkey) < + 0) { break; } first_included--; } while (last_included < static_cast(current_files.size()) - 1) { - if (comparator->Compare( - current_files[last_included + 1].smallestkey, - current_files[last_included].largestkey) > 0) { + if (comparator->Compare(current_files[last_included + 1].smallestkey, + current_files[last_included].largestkey) > 0) { break; } last_included++; @@ -674,33 +736,31 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( // include all files between the first and the last compaction input files. 
for (int f = first_included; f <= last_included; ++f) { if (current_files[f].being_compacted) { - return Status::Aborted( - "Necessary compaction input file " + current_files[f].name + - " is currently being compacted."); + return Status::Aborted("Necessary compaction input file " + + current_files[f].name + + " is currently being compacted."); } - input_files->insert( - TableFileNameToNumber(current_files[f].name)); + input_files->insert(TableFileNameToNumber(current_files[f].name)); } // update smallest and largest key if (l == 0) { for (int f = first_included; f <= last_included; ++f) { - if (comparator->Compare( - smallestkey, current_files[f].smallestkey) > 0) { + if (comparator->Compare(smallestkey, current_files[f].smallestkey) > + 0) { smallestkey = current_files[f].smallestkey; } - if (comparator->Compare( - largestkey, current_files[f].largestkey) < 0) { + if (comparator->Compare(largestkey, current_files[f].largestkey) < 0) { largestkey = current_files[f].largestkey; } } } else { - if (comparator->Compare( - smallestkey, current_files[first_included].smallestkey) > 0) { + if (comparator->Compare(smallestkey, + current_files[first_included].smallestkey) > 0) { smallestkey = current_files[first_included].smallestkey; } - if (comparator->Compare( - largestkey, current_files[last_included].largestkey) < 0) { + if (comparator->Compare(largestkey, + current_files[last_included].largestkey) < 0) { largestkey = current_files[last_included].largestkey; } } @@ -717,16 +777,15 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( // time and not by key for (int m = std::max(l, 1); m <= output_level; ++m) { for (auto& next_lv_file : levels[m].files) { - if (HaveOverlappingKeyRanges( - comparator, aggregated_file_meta, next_lv_file)) { + if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta, + next_lv_file)) { if (next_lv_file.being_compacted) { return Status::Aborted( "File " + next_lv_file.name + " that has overlapping key range with one of 
the compaction " " input file is currently being compacted."); } - input_files->insert( - TableFileNameToNumber(next_lv_file.name)); + input_files->insert(TableFileNameToNumber(next_lv_file.name)); } } } @@ -736,28 +795,25 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( Status CompactionPicker::SanitizeCompactionInputFiles( std::unordered_set* input_files, - const ColumnFamilyMetaData& cf_meta, - const int output_level) const { + const ColumnFamilyMetaData& cf_meta, const int output_level) const { assert(static_cast(cf_meta.levels.size()) - 1 == cf_meta.levels[cf_meta.levels.size() - 1].level); if (output_level >= static_cast(cf_meta.levels.size())) { return Status::InvalidArgument( "Output level for column family " + cf_meta.name + " must between [0, " + - ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + - "]."); + ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + "]."); } if (output_level > MaxOutputLevel()) { return Status::InvalidArgument( "Exceed the maximum output level defined by " "the current compaction algorithm --- " + - ToString(MaxOutputLevel())); + ToString(MaxOutputLevel())); } if (output_level < 0) { - return Status::InvalidArgument( - "Output level cannot be negative."); + return Status::InvalidArgument("Output level cannot be negative."); } if (input_files->size() == 0) { @@ -765,8 +821,8 @@ Status CompactionPicker::SanitizeCompactionInputFiles( "A compaction must contain at least one file."); } - Status s = SanitizeCompactionInputFilesForAllLevels( - input_files, cf_meta, output_level); + Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta, + output_level); if (!s.ok()) { return s; @@ -780,10 +836,9 @@ Status CompactionPicker::SanitizeCompactionInputFiles( for (auto file_meta : level_meta.files) { if (file_num == TableFileNameToNumber(file_meta.name)) { if (file_meta.being_compacted) { - return Status::Aborted( - "Specified compaction input file " + - MakeTableFileName("", file_num) 
+ - " is already being compacted."); + return Status::Aborted("Specified compaction input file " + + MakeTableFileName("", file_num) + + " is already being compacted."); } found = true; break; @@ -795,8 +850,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( } if (!found) { return Status::InvalidArgument( - "Specified compaction input file " + - MakeTableFileName("", file_num) + + "Specified compaction input file " + MakeTableFileName("", file_num) + " does not exist in column family " + cf_meta.name + "."); } } @@ -805,8 +859,8 @@ Status CompactionPicker::SanitizeCompactionInputFiles( } #endif // !ROCKSDB_LITE -bool LevelCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage) - const { +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } @@ -870,22 +924,40 @@ Compaction* LevelCompactionPicker::PickCompaction( int base_index = -1; CompactionInputFiles inputs; double score = 0; + CompactionReason compaction_reason = CompactionReason::kUnknown; // Find the compactions by size on all levels. + bool skipped_l0 = false; for (int i = 0; i < NumberLevels() - 1; i++) { score = vstorage->CompactionScore(i); level = vstorage->CompactionScoreLevel(i); assert(i == 0 || score <= vstorage->CompactionScore(i - 1)); if (score >= 1) { + if (skipped_l0 && level == vstorage->base_level()) { + // If L0->base_level compaction is pending, don't schedule further + // compaction from base level. Otherwise L0->base_level compaction + // may starve. + continue; + } output_level = (level == 0) ? vstorage->base_level() : level + 1; if (PickCompactionBySize(vstorage, level, output_level, &inputs, &parent_index, &base_index) && ExpandWhileOverlapping(cf_name, vstorage, &inputs)) { // found the compaction! 
+ if (level == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason = CompactionReason::kLevelMaxLevelSize; + } break; } else { // didn't find the compaction, clear the inputs inputs.clear(); + if (level == 0) { + skipped_l0 = true; + } } } } @@ -898,6 +970,9 @@ Compaction* LevelCompactionPicker::PickCompaction( parent_index = base_index = -1; PickFilesMarkedForCompactionExperimental(cf_name, vstorage, &inputs, &level, &output_level); + if (!inputs.empty()) { + compaction_reason = CompactionReason::kFilesMarkedForCompaction; + } } if (inputs.empty()) { return nullptr; @@ -931,7 +1006,7 @@ Compaction* LevelCompactionPicker::PickCompaction( CompactionInputFiles output_level_inputs; output_level_inputs.level = output_level; if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs, - &output_level_inputs, &parent_index, base_index)) { + &output_level_inputs, &parent_index, base_index)) { return nullptr; } @@ -947,8 +1022,10 @@ Compaction* LevelCompactionPicker::PickCompaction( mutable_cf_options.MaxFileSizeForLevel(output_level), mutable_cf_options.MaxGrandParentOverlapBytes(level), GetPathId(ioptions_, mutable_cf_options, output_level), - GetCompressionType(ioptions_, output_level, vstorage->base_level()), - std::move(grandparents), is_manual, score); + GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, + vstorage->base_level()), + std::move(grandparents), is_manual, score, + false /* deletion_compaction */, compaction_reason); // If it's level 0 compaction, make sure we don't execute any other level 0 // compactions in parallel @@ -960,11 +1037,7 @@ Compaction* LevelCompactionPicker::PickCompaction( // takes running compactions into account (by skipping files that are already // being compacted). 
Since we just changed compaction score, we recalculate it // here - { // this piece of code recomputes compaction score - CompactionOptionsFIFO dummy_compaction_options_fifo; - vstorage->ComputeCompactionScore(mutable_cf_options, - dummy_compaction_options_fifo); - } + vstorage->ComputeCompactionScore(mutable_cf_options); TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); @@ -1019,6 +1092,7 @@ bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage, // could be made better by looking at key-ranges that are // being compacted at level 0. if (level == 0 && !level0_compactions_in_progress_.empty()) { + TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); return false; } @@ -1095,18 +1169,19 @@ void UniversalCompactionPicker::SortedRun::Dump(char* out_buf, } void UniversalCompactionPicker::SortedRun::DumpSizeInfo( - char* out_buf, size_t out_buf_size, int sorted_run_count) const { + char* out_buf, size_t out_buf_size, size_t sorted_run_count) const { if (level == 0) { assert(file != nullptr); snprintf(out_buf, out_buf_size, - "file %" PRIu64 - "[%d] " + "file %" PRIu64 "[%" ROCKSDB_PRIszt + "] " "with size %" PRIu64 " (compensated size %" PRIu64 ")", file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(), file->compensated_file_size); } else { snprintf(out_buf, out_buf_size, - "level %d[%d] " + "level %d[%" ROCKSDB_PRIszt + "] " "with size %" PRIu64 " (compensated size %" PRIu64 ")", level, sorted_run_count, size, compensated_file_size); } @@ -1234,8 +1309,9 @@ Compaction* UniversalCompactionPicker::PickCompaction( std::vector sorted_runs = CalculateSortedRuns(*vstorage, ioptions_); - if (sorted_runs.size() < - (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger) { + if (sorted_runs.size() == 0 || + sorted_runs.size() < + (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger) { LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", cf_name.c_str()); return nullptr; } 
@@ -1356,8 +1432,9 @@ uint32_t UniversalCompactionPicker::GetPathId( // considered in this algorithm. So the target size can be violated in // that case. We need to improve it. uint64_t accumulated_size = 0; - uint64_t future_size = file_size * - (100 - ioptions.compaction_options_universal.size_ratio) / 100; + uint64_t future_size = + file_size * (100 - ioptions.compaction_options_universal.size_ratio) / + 100; uint32_t p = 0; assert(!ioptions.db_paths.empty()); for (; p < ioptions.db_paths.size() - 1; p++) { @@ -1381,22 +1458,27 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( unsigned int max_number_of_files_to_compact, const std::vector& sorted_runs, LogBuffer* log_buffer) { unsigned int min_merge_width = - ioptions_.compaction_options_universal.min_merge_width; + ioptions_.compaction_options_universal.min_merge_width; unsigned int max_merge_width = - ioptions_.compaction_options_universal.max_merge_width; + ioptions_.compaction_options_universal.max_merge_width; const SortedRun* sr = nullptr; bool done = false; - int start_index = 0; + size_t start_index = 0; unsigned int candidate_count = 0; - unsigned int max_files_to_compact = std::min(max_merge_width, - max_number_of_files_to_compact); + unsigned int max_files_to_compact = + std::min(max_merge_width, max_number_of_files_to_compact); min_merge_width = std::max(min_merge_width, 2U); + // Caller checks the size before executing this function. This invariant is + // important because otherwise we may have a possible integer underflow when + // dealing with unsigned types. + assert(sorted_runs.size() > 0); + // Considers a candidate file only if it is smaller than the // total size accumulated so far. 
- for (unsigned int loop = 0; loop < sorted_runs.size(); loop++) { + for (size_t loop = 0; loop < sorted_runs.size(); loop++) { candidate_count = 0; // Skip files that are already being compacted @@ -1428,7 +1510,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } // Check if the succeeding files need compaction. - for (unsigned int i = loop + 1; + for (size_t i = loop + 1; candidate_count < max_files_to_compact && i < sorted_runs.size(); i++) { const SortedRun* succeeding_sr = &sorted_runs[i]; @@ -1470,7 +1552,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( done = true; break; } else { - for (unsigned int i = loop; + for (size_t i = loop; i < loop + candidate_count && i < sorted_runs.size(); i++) { const SortedRun* skipping_sr = &sorted_runs[i]; char file_num_buf[256]; @@ -1483,7 +1565,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( if (!done || candidate_count <= 1) { return nullptr; } - unsigned int first_index_after = start_index + candidate_count; + size_t first_index_after = start_index + candidate_count; // Compression is enabled if files compacted earlier already reached // size ratio of compression. 
bool enable_compression = true; @@ -1498,7 +1580,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( uint64_t older_file_size = 0; for (size_t i = sorted_runs.size() - 1; i >= first_index_after; i--) { older_file_size += sorted_runs[i].size; - if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { + if (older_file_size * 100L >= total_size * (long)ratio_to_compress) { enable_compression = false; break; } @@ -1524,7 +1606,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].level = start_level + static_cast(i); } - for (unsigned int i = start_index; i < first_index_after; i++) { + for (size_t i = start_index; i < first_index_after; i++) { auto& picking_sr = sorted_runs[i]; if (picking_sr.level == 0) { FileMetaData* picking_file = picking_sr.file; @@ -1541,11 +1623,19 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( file_num_buf); } + CompactionReason compaction_reason; + if (max_number_of_files_to_compact == UINT_MAX) { + compaction_reason = CompactionReason::kUniversalSortedRunNum; + } else { + compaction_reason = CompactionReason::kUniversalSizeRatio; + } return new Compaction( vstorage, mutable_cf_options, std::move(inputs), output_level, mutable_cf_options.MaxFileSizeForLevel(output_level), LLONG_MAX, path_id, - GetCompressionType(ioptions_, start_level, 1, enable_compression), - /* grandparents */ {}, /* is manual */ false, score); + GetCompressionType(ioptions_, vstorage, mutable_cf_options, start_level, + 1, enable_compression), + /* grandparents */ {}, /* is manual */ false, score, + false /* deletion_compaction */, compaction_reason); } // Look at overall size amplification. 
If size amplification @@ -1559,19 +1649,19 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( VersionStorageInfo* vstorage, double score, const std::vector& sorted_runs, LogBuffer* log_buffer) { // percentage flexibilty while reducing size amplification - uint64_t ratio = ioptions_.compaction_options_universal. - max_size_amplification_percent; + uint64_t ratio = + ioptions_.compaction_options_universal.max_size_amplification_percent; unsigned int candidate_count = 0; uint64_t candidate_size = 0; - unsigned int start_index = 0; + size_t start_index = 0; const SortedRun* sr = nullptr; // Skip files that are already being compacted - for (unsigned int loop = 0; loop < sorted_runs.size() - 1; loop++) { + for (size_t loop = 0; loop < sorted_runs.size() - 1; loop++) { sr = &sorted_runs[loop]; if (!sr->being_compacted) { - start_index = loop; // Consider this as the first candidate. + start_index = loop; // Consider this as the first candidate. break; } char file_num_buf[kFormatFileNumberBufSize]; @@ -1583,18 +1673,19 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( } if (sr == nullptr) { - return nullptr; // no candidate files + return nullptr; // no candidate files } { char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); - LogToBuffer(log_buffer, "[%s] Universal: First candidate %s[%d] %s", + LogToBuffer(log_buffer, + "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s", cf_name.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); } // keep adding up all the remaining files - for (unsigned int loop = start_index; loop < sorted_runs.size() - 1; loop++) { + for (size_t loop = start_index; loop < sorted_runs.size() - 1; loop++) { sr = &sorted_runs[loop]; if (sr->being_compacted) { char file_num_buf[kFormatFileNumberBufSize]; @@ -1620,21 +1711,21 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( LogToBuffer( log_buffer, "[%s] Universal: size 
amp not needed. newer-files-total-size %" PRIu64 - "earliest-file-size %" PRIu64, + " earliest-file-size %" PRIu64, cf_name.c_str(), candidate_size, earliest_file_size); return nullptr; } else { LogToBuffer( log_buffer, "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 - "earliest-file-size %" PRIu64, + " earliest-file-size %" PRIu64, cf_name.c_str(), candidate_size, earliest_file_size); } assert(start_index < sorted_runs.size() - 1); // Estimate total file size uint64_t estimated_total_size = 0; - for (unsigned int loop = start_index; loop < sorted_runs.size(); loop++) { + for (size_t loop = start_index; loop < sorted_runs.size(); loop++) { estimated_total_size += sorted_runs[loop].size; } uint32_t path_id = GetPathId(ioptions_, estimated_total_size); @@ -1645,7 +1736,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( inputs[i].level = start_level + static_cast(i); } // We always compact all the files, so always compress. - for (unsigned int loop = start_index; loop < sorted_runs.size(); loop++) { + for (size_t loop = start_index; loop < sorted_runs.size(); loop++) { auto& picking_sr = sorted_runs[loop]; if (picking_sr.level == 0) { FileMetaData* f = picking_sr.file; @@ -1657,7 +1748,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( } } char file_num_buf[256]; - sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); + picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); LogToBuffer(log_buffer, "[%s] Universal: size amp picking %s", cf_name.c_str(), file_num_buf); } @@ -1667,12 +1758,15 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( vstorage->num_levels() - 1, mutable_cf_options.MaxFileSizeForLevel(vstorage->num_levels() - 1), /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage->num_levels() - 1, 1), - /* grandparents */ {}, /* is manual */ false, score); + GetCompressionType(ioptions_, vstorage, 
mutable_cf_options, + vstorage->num_levels() - 1, 1), + /* grandparents */ {}, /* is manual */ false, score, + false /* deletion_compaction */, + CompactionReason::kUniversalSizeAmplification); } -bool FIFOCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage) - const { +bool FIFOCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { const int kLevel0 = 0; return vstorage->CompactionScore(kLevel0) >= 1; } @@ -1727,7 +1821,7 @@ Compaction* FIFOCompactionPicker::PickCompaction( Compaction* c = new Compaction( vstorage, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, kNoCompression, {}, /* is manual */ false, vstorage->CompactionScore(0), - /* is deletion compaction */ true); + /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize); level0_compactions_in_progress_.insert(c); return c; } @@ -1736,7 +1830,7 @@ Compaction* FIFOCompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) { + InternalKey** compaction_end, bool* manual_conflict) { assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; diff --git a/external/rocksdb/db/compaction_picker.h b/external/rocksdb/db/compaction_picker.h index e7d8bf6dbf..fca7319598 100644 --- a/external/rocksdb/db/compaction_picker.h +++ b/external/rocksdb/db/compaction_picker.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -60,29 +60,28 @@ class CompactionPicker { const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end); + InternalKey** compaction_end, bool* manual_conflict); // The maximum allowed output level. Default value is NumberLevels() - 1. - virtual int MaxOutputLevel() const { - return NumberLevels() - 1; - } + virtual int MaxOutputLevel() const { return NumberLevels() - 1; } virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0; - // Sanitize the input set of compaction input files. - // When the input parameters do not describe a valid compaction, the - // function will try to fix the input_files by adding necessary - // files. If it's not possible to conver an invalid input_files - // into a valid one by adding more files, the function will return a - // non-ok status with specific reason. +// Sanitize the input set of compaction input files. +// When the input parameters do not describe a valid compaction, the +// function will try to fix the input_files by adding necessary +// files. If it's not possible to conver an invalid input_files +// into a valid one by adding more files, the function will return a +// non-ok status with specific reason. 
#ifndef ROCKSDB_LITE - Status SanitizeCompactionInputFiles( - std::unordered_set* input_files, - const ColumnFamilyMetaData& cf_meta, - const int output_level) const; + Status SanitizeCompactionInputFiles(std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const; #endif // ROCKSDB_LITE // Free up the files that participated in a compaction + // + // Requirement: DB mutex held void ReleaseCompactionFiles(Compaction* c, Status status); // Returns true if any one of the specified files are being compacted @@ -94,7 +93,7 @@ class CompactionPicker { const CompactionOptions& compact_options, const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, - uint32_t output_path_id) const; + uint32_t output_path_id); // Converts a set of compaction input file numbers into // a list of CompactionInputFiles. @@ -121,15 +120,15 @@ class CompactionPicker { // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. // REQUIRES: inputs is not empty - void GetRange(const CompactionInputFiles& inputs, - InternalKey* smallest, InternalKey* largest); + void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest, + InternalKey* largest); // Stores the minimal range that covers all entries in inputs1 and inputs2 // in *smallest, *largest. // REQUIRES: inputs is not empty void GetRange(const CompactionInputFiles& inputs1, - const CompactionInputFiles& inputs2, - InternalKey* smallest, InternalKey* largest); + const CompactionInputFiles& inputs2, InternalKey* smallest, + InternalKey* largest); // Add more files to the inputs on "level" to make sure that // no newer version of a key is compacted to "level+1" while leaving an older @@ -164,13 +163,12 @@ class CompactionPicker { const ImmutableCFOptions& ioptions_; - // A helper function to SanitizeCompactionInputFiles() that - // sanitizes "input_files" by adding necessary files. 
+// A helper function to SanitizeCompactionInputFiles() that +// sanitizes "input_files" by adding necessary files. #ifndef ROCKSDB_LITE virtual Status SanitizeCompactionInputFilesForAllLevels( std::unordered_set* input_files, - const ColumnFamilyMetaData& cf_meta, - const int output_level) const; + const ColumnFamilyMetaData& cf_meta, const int output_level) const; #endif // ROCKSDB_LITE // Keeps track of all compactions that are running on Level0. @@ -190,8 +188,8 @@ class LevelCompactionPicker : public CompactionPicker { VersionStorageInfo* vstorage, LogBuffer* log_buffer) override; - virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const - override; + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; // Pick a path ID to place a newly generated file, with its level static uint32_t GetPathId(const ImmutableCFOptions& ioptions, @@ -230,8 +228,8 @@ class UniversalCompactionPicker : public CompactionPicker { virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } - virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const - override; + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; private: struct SortedRun { @@ -251,7 +249,7 @@ class UniversalCompactionPicker : public CompactionPicker { // sorted_run_count is added into the string to print void DumpSizeInfo(char* out_buf, size_t out_buf_size, - int sorted_run_count) const; + size_t sorted_run_count) const; int level; // `file` Will be null for level > 0. 
For level = 0, the sorted run is @@ -302,22 +300,20 @@ class FIFOCompactionPicker : public CompactionPicker { const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) override; + InternalKey** compaction_end, bool* manual_conflict) override; // The maximum allowed output level. Always returns 0. - virtual int MaxOutputLevel() const override { - return 0; - } + virtual int MaxOutputLevel() const override { return 0; } - virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const - override; + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; }; class NullCompactionPicker : public CompactionPicker { public: NullCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) : - CompactionPicker(ioptions, icmp) {} + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} virtual ~NullCompactionPicker() {} // Always return "nullptr" @@ -329,23 +325,27 @@ class NullCompactionPicker : public CompactionPicker { } // Always return "nullptr" - Compaction* CompactRange( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) override { + Compaction* CompactRange(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, + int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, + bool* manual_conflict) override { return nullptr; } // Always returns false. 
- virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const - override { + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override { return false; } }; #endif // !ROCKSDB_LITE CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, + const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, int level, int base_level, const bool enable_compression = true); diff --git a/external/rocksdb/db/compaction_picker_test.cc b/external/rocksdb/db/compaction_picker_test.cc index ef86058ccb..2d3265421a 100644 --- a/external/rocksdb/db/compaction_picker_test.cc +++ b/external/rocksdb/db/compaction_picker_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -79,7 +79,7 @@ class CompactionPickerTest : public testing::Test { } void Add(int level, uint32_t file_number, const char* smallest, - const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, + const char* largest, uint64_t file_size = 1, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { assert(level < vstorage_->num_levels()); @@ -119,7 +119,7 @@ class CompactionPickerTest : public testing::Test { vstorage_->UpdateNumNonEmptyLevels(); vstorage_->GenerateFileIndexer(); vstorage_->GenerateLevelFilesBrief(); - vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_); + vstorage_->ComputeCompactionScore(mutable_cf_options_); vstorage_->GenerateLevel0NonOverlapping(); vstorage_->SetFinalized(); } @@ -195,6 +195,7 @@ TEST_F(CompactionPickerTest, LevelMaxScore) { NewVersionStorage(6, kCompactionStyleLevel); mutable_cf_options_.target_file_size_base = 10000000; mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; Add(0, 1U, "150", "200", 1000000000U); // Level 1 score 1.2 Add(1, 66U, "150", "200", 6000000U); @@ -319,6 +320,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic4) { mutable_cf_options_.level0_file_num_compaction_trigger = 2; mutable_cf_options_.max_bytes_for_level_base = 200; mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); Add(0, 1U, "150", "200"); Add(0, 2U, "200", "250"); @@ -351,6 +353,7 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) { mutable_cf_options_.level0_file_num_compaction_trigger = 2; mutable_cf_options_.max_bytes_for_level_base = 200; mutable_cf_options_.max_bytes_for_level_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; NewVersionStorage(num_levels, kCompactionStyleLevel); Add(0, 1U, "150", "200"); Add(num_levels - 1, 3U, "200", "250", 300U); @@ -366,11 +369,9 @@ TEST_F(CompactionPickerTest, 
LevelTriggerDynamic4) { cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); - ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); - ASSERT_EQ(2U, compaction->num_input_files(1)); - ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber()); - ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber()); - ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(0, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(num_levels - 1, compaction->output_level()); } @@ -466,7 +467,6 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { fifo_options_.max_table_files_size = kMaxSize; ioptions_.compaction_options_fifo = fifo_options_; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); - UpdateVersionStorageInfo(); // must return false when there's no files. ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false); @@ -487,6 +487,90 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { } #endif // ROCKSDB_LITE +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "179", 50000000U); + Add(2, 7U, "180", "220", 50000000U); + Add(2, 8U, "321", "400", 50000000U); // File not overlapping + Add(2, 9U, "721", "800", 50000000U); + + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + Add(3, 30U, "750", "900", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, 
mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Pick file 8 because it overlaps with 0 files on level 3. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "175", + 60000000U); // Overlaps with file 26, 27, total size 521M + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size + // 520M, the smalelst overlapping + Add(2, 8U, "201", "300", + 60000000U); // Overlaps with file 28, 29, total size 521M + + Add(3, 26U, "100", "110", 261000000U); + Add(3, 26U, "150", "170", 261000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 261000000U); + Add(3, 30U, "321", "400", 261000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 7 because overlapping ratio is the biggest. + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_bytes_for_level_base = 10000000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + // file 7 and 8 over lap with the same file, but file 8 is smaller so + // it will be picked. 
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27 + Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file + // itself is larger. Should be picked. + + Add(3, 26U, "160", "165", 260000000U); + Add(3, 27U, "166", "170", 260000000U); + Add(3, 28U, "180", "400", 260000000U); + Add(3, 29U, "401", "500", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + // This test exhibits the bug where we don't properly reset parent_index in // PickCompaction() TEST_F(CompactionPickerTest, ParentIndexResetBug) { @@ -514,6 +598,8 @@ TEST_F(CompactionPickerTest, ParentIndexResetBug) { // ranges (with different sequence numbers) in the input files. 
TEST_F(CompactionPickerTest, OverlappingUserKeys) { NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.compaction_pri = kByCompensatedSize; + Add(1, 1U, "100", "150", 1U); // Overlapping user keys Add(1, 2U, "200", "400", 1U); @@ -535,9 +621,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys2) { // Overlapping user keys on same level and output level Add(1, 1U, "200", "400", 1000000000U); Add(1, 2U, "400", "500", 1U, 0, 0); - Add(2, 3U, "400", "600", 1U); - // The following file is not in the compaction despite overlapping user keys - Add(2, 4U, "600", "700", 1U, 0, 0); + Add(2, 3U, "000", "100", 1U); + Add(2, 4U, "100", "600", 1U, 0, 0); + Add(2, 5U, "600", "700", 1U, 0, 0); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( @@ -545,10 +631,12 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys2) { ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(2U, compaction->num_input_files(0)); - ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->num_input_files(1)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber()); } TEST_F(CompactionPickerTest, OverlappingUserKeys3) { @@ -580,6 +668,126 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys3) { ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); } +TEST_F(CompactionPickerTest, OverlappingUserKeys4) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_bytes_for_level_base = 1000000; + + Add(1, 1U, "100", "150", 1U); + Add(1, 2U, "150", "199", 1U, 0, 0); + Add(1, 3U, "200", "250", 1100000U, 0, 0); + Add(1, 4U, "251", "300", 1U, 0, 0); + Add(1, 5U, "300", "350", 1U, 0, 0); + + Add(2, 6U, "100", "115", 1U); + Add(2, 7U, "125", "325", 1U); + 
Add(2, 8U, "350", "400", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. + Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + file_map_[4u].first->being_compacted = true; + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // No compaction should be scheduled, if L0 has higher priority than L1 + // but L0->L1 compaction is blocked by a file in L1 being compacted. 
+ UpdateVersionStorageInfo(); + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. + Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // If no file in L1 being compacted, L0->L1 compaction will be scheduled. + UpdateVersionStorageInfo(); // being_compacted flag is cleared here. + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. 
+ Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 score more than 6. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + file_map_[4u].first->being_compacted = true; + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + Add(1, 51U, "351", "400", 6000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // If score in L1 is larger than L0, L1 compaction goes through despite + // there is pending L0 compaction. + UpdateVersionStorageInfo(); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); +} + TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) { int num_levels = ioptions_.num_levels; ioptions_.level_compaction_dynamic_level_bytes = false; @@ -594,19 +802,22 @@ TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) { Add(1, 4U, "400", "500", 600); Add(1, 5U, "600", "700", 600); // Level 2 is less than target 10000 even added size of level 1 + // Size ratio of L2/L1 is 9600 / 1200 = 8 Add(2, 6U, "150", "200", 2500); Add(2, 7U, "201", "210", 2000); - Add(2, 8U, "300", "310", 2500); + Add(2, 8U, "300", "310", 2600); Add(2, 9U, "400", "500", 2500); // Level 3 exceeds target 100,000 of 1000 Add(3, 10U, "400", "500", 101000); - // Level 4 exceeds target 1,000,000 of 500 after adding size from level 3 - Add(4, 11U, "400", "500", 999500); - Add(5, 11U, "400", "500", 8000000); + // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3 + // Size ratio L4/L3 is 9.9 + // After merge from L3, L4 size is 
1000900 + Add(4, 11U, "400", "500", 999900); + Add(5, 11U, "400", "500", 8007200); UpdateVersionStorageInfo(); - ASSERT_EQ(2200u + 11000u + 5500u, + ASSERT_EQ(200u * 9u + 10900u + 900u * 9, vstorage_->estimated_compaction_needed_bytes()); } @@ -622,17 +833,42 @@ TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) { Add(0, 4U, "150", "200", 200); Add(0, 5U, "150", "200", 200); Add(0, 6U, "150", "200", 200); - // Level 1 is over target by + // Level 1 size will be 1400 after merging with L0 Add(1, 7U, "400", "500", 200); Add(1, 8U, "600", "700", 200); // Level 2 is less than target 10000 even added size of level 1 - Add(2, 9U, "150", "200", 9500); + Add(2, 9U, "150", "200", 9100); + // Level 3 over the target, but since level 4 is empty, we assume it will be + // a trivial move. Add(3, 10U, "400", "500", 101000); UpdateVersionStorageInfo(); - ASSERT_EQ(1400u + 4400u + 11000u, - vstorage_->estimated_compaction_needed_bytes()); + // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0) + ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200", 2000); + Add(0, 2U, "150", "200", 2000); + Add(0, 4U, "150", "200", 2000); + Add(0, 5U, "150", "200", 2000); + Add(0, 6U, "150", "200", 1000); + // Level 1 size will be 10000 after merging with L0 + Add(1, 7U, "400", "500", 500); + Add(1, 8U, "600", "700", 500); + + Add(2, 9U, "150", "200", 10000); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes()); } TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) { @@ 
-656,12 +892,14 @@ TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) { // num_levels - 3 is over target by 100 + 1000 Add(num_levels - 3, 7U, "400", "500", 300); Add(num_levels - 3, 8U, "600", "700", 300); - // Level 2 is over target by 1100 + 100 - Add(num_levels - 2, 9U, "150", "200", 5100); + // num_levels - 2 is over target by 1100 + 200 + Add(num_levels - 2, 9U, "150", "200", 5200); UpdateVersionStorageInfo(); - ASSERT_EQ(1600u + 12100u + 13200u, + // Merging to the second last level: (5200 / 1600 + 1) * 1100 + // Merging to the last level: (50000 / 6300 + 1) * 1300 + ASSERT_EQ(1600u + 4675u + 11617u, vstorage_->estimated_compaction_needed_bytes()); } diff --git a/external/rocksdb/db/comparator_db_test.cc b/external/rocksdb/db/comparator_db_test.cc index cb944a76a9..e4e84107ef 100644 --- a/external/rocksdb/db/comparator_db_test.cc +++ b/external/rocksdb/db/comparator_db_test.cc @@ -1,16 +1,17 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #include #include +#include "memtable/stl_wrappers.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/hash.h" -#include "util/stl_wrappers.h" +#include "util/kv_map.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" diff --git a/external/rocksdb/db/convenience.cc b/external/rocksdb/db/convenience.cc index 17f7812523..b1042c74d5 100644 --- a/external/rocksdb/db/convenience.cc +++ b/external/rocksdb/db/convenience.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,6 +18,13 @@ namespace rocksdb { void CancelAllBackgroundWork(DB* db, bool wait) { (dynamic_cast(db))->CancelAllBackgroundWork(wait); } + +Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + return (dynamic_cast(db)) + ->DeleteFilesInRange(column_family, begin, end); +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/external/rocksdb/db/corruption_test.cc b/external/rocksdb/db/corruption_test.cc index 81cff970f9..e7d82407ae 100644 --- a/external/rocksdb/db/corruption_test.cc +++ b/external/rocksdb/db/corruption_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -11,6 +11,7 @@ #include "rocksdb/db.h" +#include #include #include #include @@ -41,6 +42,7 @@ class CorruptionTest : public testing::Test { CorruptionTest() { tiny_cache_ = NewLRUCache(100); + options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; options_.env = &env_; dbname_ = test::TmpDir() + "/corruption_test"; DestroyDB(dbname_, options_); @@ -104,8 +106,8 @@ class CorruptionTest : public testing::Test { } void Check(int min_expected, int max_expected) { - unsigned int next_expected = 0; - int missed = 0; + uint64_t next_expected = 0; + uint64_t missed = 0; int bad_keys = 0; int bad_values = 0; int correct = 0; @@ -126,7 +128,7 @@ class CorruptionTest : public testing::Test { continue; } missed += (key - next_expected); - next_expected = static_cast(key + 1); + next_expected = key + 1; if (iter->value() != Value(static_cast(key), &value_space)) { bad_values++; } else { @@ -136,8 +138,9 @@ class CorruptionTest : public testing::Test { delete iter; fprintf(stderr, - "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", - min_expected, max_expected, correct, bad_keys, bad_values, missed); + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n", + min_expected, max_expected, correct, bad_keys, bad_values, + static_cast(missed)); ASSERT_LE(min_expected, correct); ASSERT_GE(max_expected, correct); } @@ -183,7 +186,7 @@ class CorruptionTest : public testing::Test { FileType type; std::string fname; int picked_number = -1; - for (unsigned int i = 0; i < filenames.size(); i++) { + for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type == filetype && static_cast(number) > picked_number) { // Pick latest file @@ -232,8 +235,16 @@ class CorruptionTest : public testing::Test { // Return the value to associate with the specified key Slice Value(int k, std::string* storage) { - Random r(k); - return test::RandomString(&r, kValueSize, storage); + if (k == 0) { + // Ugh. 
Random seed of 0 used to produce no entropy. This code + // preserves the implementation that was in place when all of the + // magic values in this file were picked. + *storage = std::string(kValueSize, ' '); + return Slice(*storage); + } else { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } } }; diff --git a/external/rocksdb/db/cuckoo_table_db_test.cc b/external/rocksdb/db/cuckoo_table_db_test.cc index 09a68de921..f48b5b436c 100644 --- a/external/rocksdb/db/cuckoo_table_db_test.cc +++ b/external/rocksdb/db/cuckoo_table_db_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/db_block_cache_test.cc b/external/rocksdb/db/db_block_cache_test.cc new file mode 100644 index 0000000000..96d5be980d --- /dev/null +++ b/external/rocksdb/db/db_block_cache_test.cc @@ -0,0 +1,496 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace rocksdb { + +class DBBlockCacheTest : public DBTestBase { + private: + size_t miss_count_ = 0; + size_t hit_count_ = 0; + size_t insert_count_ = 0; + size_t failure_count_ = 0; + size_t compressed_miss_count_ = 0; + size_t compressed_hit_count_ = 0; + size_t compressed_insert_count_ = 0; + size_t compressed_failure_count_ = 0; + + public: + const size_t kNumBlocks = 10; + const size_t kValueSize = 100; + + DBBlockCacheTest() : DBTestBase("/db_block_cache_test") {} + + BlockBasedTableOptions GetTableOptions() { + BlockBasedTableOptions table_options; + // Set a small enough block size so that each key-value get its own block. + table_options.block_size = 1; + return table_options; + } + + Options GetOptions(const BlockBasedTableOptions& table_options) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.avoid_flush_during_recovery = false; + // options.compression = kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + return options; + } + + void InitTable(const Options& options) { + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value.c_str())); + } + } + + void RecordCacheCounters(const Options& options) { + miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS); + hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT); + insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD); + failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + compressed_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); + compressed_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); + compressed_insert_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD); + compressed_failure_count_ = + TestGetTickerCount(options, 
BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } + + void CheckCacheCounters(const Options& options, size_t expected_misses, + size_t expected_hits, size_t expected_inserts, + size_t expected_failures) { + size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS); + size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT); + size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD); + size_t new_failure_count = + TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + ASSERT_EQ(miss_count_ + expected_misses, new_miss_count); + ASSERT_EQ(hit_count_ + expected_hits, new_hit_count); + ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count); + ASSERT_EQ(failure_count_ + expected_failures, new_failure_count); + miss_count_ = new_miss_count; + hit_count_ = new_hit_count; + insert_count_ = new_insert_count; + failure_count_ = new_failure_count; + } + + void CheckCompressedCacheCounters(const Options& options, + size_t expected_misses, + size_t expected_hits, + size_t expected_inserts, + size_t expected_failures) { + size_t new_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); + size_t new_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); + size_t new_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD); + size_t new_failure_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count); + ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count); + ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count); + ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count); + compressed_miss_count_ = new_miss_count; + compressed_hit_count_ = new_hit_count; + compressed_insert_count_ = new_insert_count; + compressed_failure_count_ = new_failure_count; + } +}; + +TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { + ReadOptions read_options; + auto 
table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + table_options.block_cache = cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + // Load blocks into cache. + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + iterators[i].reset(iter); + } + size_t usage = cache->GetUsage(); + ASSERT_LT(0, usage); + cache->SetCapacity(usage); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + + // Test with strict capacity limit. + cache->SetStrictCapacityLimit(true); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_TRUE(iter->status().IsIncomplete()); + CheckCacheCounters(options, 1, 0, 0, 1); + delete iter; + iter = nullptr; + + // Release iterators and access cache again. 
+ for (size_t i = 0; i < kNumBlocks - 1; i++) { + iterators[i].reset(); + CheckCacheCounters(options, 0, 0, 0, 0); + } + ASSERT_EQ(0, cache->GetPinnedUsage()); + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 0, 1, 0, 0); + iterators[i].reset(iter); + } +} + +#ifdef SNAPPY +TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { + ReadOptions read_options; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + options.compression = CompressionType::kSnappyCompression; + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + std::shared_ptr compressed_cache = NewLRUCache(0, 0, false); + table_options.block_cache = cache; + table_options.block_cache_compressed = compressed_cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + // Load blocks into cache. + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + CheckCompressedCacheCounters(options, 1, 0, 1, 0); + iterators[i].reset(iter); + } + size_t usage = cache->GetUsage(); + ASSERT_LT(0, usage); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + size_t compressed_usage = compressed_cache->GetUsage(); + ASSERT_LT(0, compressed_usage); + // Compressed block cache cannot be pinned. + ASSERT_EQ(0, compressed_cache->GetPinnedUsage()); + + // Set strict capacity limit flag. Now block will only load into compressed + // block cache. 
+ cache->SetCapacity(usage); + cache->SetStrictCapacityLimit(true); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + // compressed_cache->SetCapacity(compressed_usage); + compressed_cache->SetCapacity(0); + // compressed_cache->SetStrictCapacityLimit(true); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_TRUE(iter->status().IsIncomplete()); + CheckCacheCounters(options, 1, 0, 0, 1); + CheckCompressedCacheCounters(options, 1, 0, 1, 0); + delete iter; + iter = nullptr; + + // Clear strict capacity limit flag. This time we shall hit compressed block + // cache. + cache->SetStrictCapacityLimit(false); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + CheckCompressedCacheCounters(options, 0, 1, 0, 0); + delete iter; + iter = nullptr; +} +#endif // SNAPPY + +#ifndef ROCKSDB_LITE + +// Make sure that when options.block_cache is set, after a new table is +// created its index/filter blocks are added to block cache. +TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); + + // index/filter blocks added to block cache right after table creation. 
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, /* only index/filter were added */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + uint64_t int_num; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + // Make sure filter block is in cache. + std::string value; + ReadOptions ropt; + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + + // Miss count should remain the same. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Make sure index block is in cache. 
+ auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); + value = Get(1, "key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(index_block_hit + 1, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + value = Get(1, "key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(index_block_hit + 2, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); +} + +TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + // 200 bytes are enough to hold the first two blocks + std::shared_ptr cache = NewLRUCache(200, 0, false); + table_options.block_cache = cache; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "key", "val")); + // Create a new table + ASSERT_OK(Flush(1)); + size_t index_bytes_insert = + TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT); + size_t filter_bytes_insert = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT); + ASSERT_GT(index_bytes_insert, 0); + ASSERT_GT(filter_bytes_insert, 0); + ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); + // set the cache capacity to the current usage + cache->SetCapacity(index_bytes_insert + filter_bytes_insert); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); + ASSERT_OK(Put(1, "key2", "val")); + // Create a new table + ASSERT_OK(Flush(1)); + // cache evicted old index and block entries + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT), + index_bytes_insert); + 
ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), + filter_bytes_insert); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), + index_bytes_insert); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), + filter_bytes_insert); +} + +TEST_F(DBBlockCacheTest, ParanoidFileChecks) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.level0_file_num_compaction_trigger = 2; + options.paranoid_file_checks = true; + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "1_key", "val")); + ASSERT_OK(Put(1, "9_key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); + ASSERT_EQ(1, /* read and cache data block */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Put(1, "1_key2", "val2")); + ASSERT_OK(Put(1, "9_key2", "val2")); + // Create a new SST file. This will further trigger a compaction + // and generate another file. + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(3, /* Totally 3 files created up to now */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // After disabling options.paranoid_file_checks. NO further block + // is added after generating a new file. 
+ ASSERT_OK( + dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}})); + + ASSERT_OK(Put(1, "1_key3", "val3")); + ASSERT_OK(Put(1, "9_key3", "val3")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "1_key4", "val4")); + ASSERT_OK(Put(1, "9_key4", "val4")); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(3, /* Totally 3 files created up to now */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); +} + +TEST_F(DBBlockCacheTest, CompressedCache) { + if (!Snappy_Supported()) { + return; + } + int num_iter = 80; + + // Run this test three iterations. + // Iteration 1: only a uncompressed block cache + // Iteration 2: only a compressed block cache + // Iteration 3: both block cache and compressed cache + // Iteration 4: both block cache and compressed cache, but DB is not + // compressed + for (int iter = 0; iter < 4; iter++) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; // small write buffer + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + switch (iter) { + case 0: + // only uncompressed block cache + table_options.block_cache = NewLRUCache(8 * 1024); + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 1: + // no block cache, only compressed cache + table_options.no_block_cache = true; + table_options.block_cache = nullptr; + table_options.block_cache_compressed = NewLRUCache(8 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 2: + // both compressed and uncompressed block cache + table_options.block_cache = NewLRUCache(1024); + table_options.block_cache_compressed = NewLRUCache(8 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 3: + // both block cache and compressed cache, but DB is not compressed + // also, make block cache sizes bigger, to trigger block cache hits + 
table_options.block_cache = NewLRUCache(1024 * 1024); + table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.compression = kNoCompression; + break; + default: + ASSERT_TRUE(false); + } + CreateAndReopenWithCF({"pikachu"}, options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + no_block_cache_opts.table_factory.reset( + NewBlockBasedTableFactory(table_options_no_bc)); + ReopenWithColumnFamilies( + {"default", "pikachu"}, + std::vector({no_block_cache_opts, options})); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = RandomString(&rnd, 1000); + } + values.push_back(str); + ASSERT_OK(Put(1, Key(i), values[i])); + } + + // flush all data from memtable so that reads are from block cache + ASSERT_OK(Flush(1)); + + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } + + // check that we triggered the appropriate code paths in the cache + switch (iter) { + case 0: + // only uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 1: + // no block cache, only compressed cache + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 2: + // both compressed and uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, 
BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 3: + // both compressed and uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + // compressed doesn't have any hits since blocks are not compressed on + // storage + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0); + break; + default: + ASSERT_TRUE(false); + } + + options.create_if_missing = true; + DestroyAndReopen(options); + } +} + +#endif // ROCKSDB_LITE + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_bloom_filter_test.cc b/external/rocksdb/db/db_bloom_filter_test.cc new file mode 100644 index 0000000000..07f2409ab3 --- /dev/null +++ b/external/rocksdb/db/db_bloom_filter_test.cc @@ -0,0 +1,1028 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" + +namespace rocksdb { + +// DB tests related to bloom filter. 
+ +class DBBloomFilterTest : public DBTestBase { + public: + DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {} +}; + +class DBBloomFilterTestWithParam : public DBTestBase, + public testing::WithParamInterface { + protected: + bool use_block_based_filter_; + + public: + DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {} + + ~DBBloomFilterTestWithParam() {} + + void SetUp() override { use_block_based_filter_ = GetParam(); } +}; + +// KeyMayExist can lead to a few false positives, but not false negatives. +// To make test deterministic, use a much larger number of bits per key-20 than +// bits in the key, so that false positives are eliminated +TEST_P(DBBloomFilterTestWithParam, KeyMayExist) { + do { + ReadOptions ropts; + std::string value; + anon::OptionsOverride options_override; + options_override.filter_policy.reset( + NewBloomFilterPolicy(20, use_block_based_filter_)); + Options options = CurrentOptions(options_override); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + + ASSERT_OK(Put(1, "a", "b")); + bool value_found = false; + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(value_found); + ASSERT_EQ("b", value); + + ASSERT_OK(Flush(1)); + value.clear(); + + uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. 
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "a")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Flush(1)); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "c")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. 
+ } while ( + ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); +} + +TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + dbfull()->Flush(fo); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + + ro.total_order_seek = true; + ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound()); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); +} + +TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + 
fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + dbfull()->Flush(fo); + + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Reopen with whole key filtering enabled and prefix extractor + // NULL. Bloom filter should be off for both of whole key and + // prefix bloom. + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor.reset(); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + // Write DB with only full key filtering. + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Reopen with both of whole key off and prefix extractor enabled. + // Still no bloom filter should be used. 
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Try to create a DB with mixed files: + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + options.prefix_extractor.reset(); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + // Try to create a DB with mixed files. + ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); + // In this case needs insert some keys to make sure files are + // not filtered out by key ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + Flush(); + + // Now we have two files: + // File 1: An older file with prefix bloom. + // File 2: A newer file with whole bloom filter. 
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + + // Reopen with the same setting: only whole key is used + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + + // Restart with both filters are allowed + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + // File 1 will have it filtered out. + // File 2 will not, as prefix `foo` exists in the file. + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + + // Restart with only prefix bloom is allowed. 
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); +} + +TEST_P(DBBloomFilterTestWithParam, BloomFilter) { + do { + Options options = CurrentOptions(); + env_->count_random_reads_ = true; + options.env = env_; + // ChangeCompactOptions() only changes compaction style, which does not + // trigger reset of table_factory + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_filter_)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, options); + + // Populate multiple layers + const int N = 10000; + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Compact(1, "a", "z"); + for (int i = 0; i < N; i += 100) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Flush(1); + + // Prevent auto compactions triggered by seeks + env_->delay_sstable_sync_.store(true, std::memory_order_release); + + // Lookup present keys. Should rarely read from small sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + int reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d present => %d reads\n", N, reads); + ASSERT_GE(reads, N); + ASSERT_LE(reads, N + 2 * N / 100); + + // Lookup present keys. Should rarely read from either sstable. 
+ env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); + } + reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d missing => %d reads\n", N, reads); + ASSERT_LE(reads, 3 * N / 100); + + env_->delay_sstable_sync_.store(false, std::memory_order_release); + Close(); + } while (ChangeCompactOptions()); +} + +INSTANTIATE_TEST_CASE_P(DBBloomFilterTestWithParam, DBBloomFilterTestWithParam, + ::testing::Bool()); + +TEST_F(DBBloomFilterTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); + } +} + +TEST_F(DBBloomFilterTest, BloomFilterCompatibility) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with block based filter + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with full filter + 
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with full filter + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with block_based filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +namespace { +// A wrapped bloom over default FilterPolicy +class WrappedBloom : public FilterPolicy { + public: + explicit WrappedBloom(int bits_per_key) + : filter_(NewBloomFilterPolicy(bits_per_key)), counter_(0) {} + + ~WrappedBloom() { delete filter_; } + + const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } + + void CreateFilter(const rocksdb::Slice* keys, int n, + std::string* dst) const override { + std::unique_ptr user_keys(new rocksdb::Slice[n]); + for (int i = 0; i < n; ++i) { + user_keys[i] = convertKey(keys[i]); + } + return 
filter_->CreateFilter(user_keys.get(), n, dst); + } + + bool KeyMayMatch(const rocksdb::Slice& key, + const rocksdb::Slice& filter) const override { + counter_++; + return filter_->KeyMayMatch(convertKey(key), filter); + } + + uint32_t GetCounter() { return counter_; } + + private: + const FilterPolicy* filter_; + mutable uint32_t counter_; + + rocksdb::Slice convertKey(const rocksdb::Slice& key) const { return key; } +}; +} // namespace + +TEST_F(DBBloomFilterTest, BloomFilterWrapper) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + WrappedBloom* policy = new WrappedBloom(10); + table_options.filter_policy.reset(policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + ASSERT_EQ(0U, policy->GetCounter()); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ(1U * maxKey, policy->GetCounter()); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); + ASSERT_EQ(2U * maxKey, policy->GetCounter()); +} + +class SliceTransformLimitedDomain : public SliceTransform { + const char* Name() const override { return "SliceTransformLimitedDomain"; } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 5); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? 
+ return src.size() >= 5 && src[0] == 'x'; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? + return dst.size() == 5 && dst[0] == 'x'; + } +}; + +TEST_F(DBBloomFilterTest, PrefixExtractorFullFilter) { + BlockBasedTableOptions bbto; + // Full Filter Block + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1111_AAAA", "val1")); + ASSERT_OK(Put("x1112_AAAA", "val2")); + ASSERT_OK(Put("x1113_AAAA", "val3")); + ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, wont be added to filter + ASSERT_OK(Put("zzzzz_AAAA", "val5")); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("x1111_AAAA"), "val1"); + ASSERT_EQ(Get("x1112_AAAA"), "val2"); + ASSERT_EQ(Get("x1113_AAAA"), "val3"); + ASSERT_EQ(Get("x1114_AAAA"), "val4"); + // Was not added to filter but rocksdb will try to read it from the filter + ASSERT_EQ(Get("zzzzz_AAAA"), "val5"); +} + +TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) { + BlockBasedTableOptions bbto; + // Block Filter Block + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1113_AAAA", "val3")); + ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, wont be added to filter + ASSERT_OK(Put("zzzzz_AAAA", "val1")); + ASSERT_OK(Put("zzzzz_AAAB", "val2")); + ASSERT_OK(Put("zzzzz_AAAC", "val3")); + ASSERT_OK(Put("zzzzz_AAAD", "val4")); + + ASSERT_OK(Flush()); + + std::vector iter_res; + auto iter = db_->NewIterator(ReadOptions()); + // Seek to a key that was not in Domain + for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) { + 
iter_res.emplace_back(iter->value().ToString()); + } + + std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"}; + ASSERT_EQ(iter_res, expected_res); + delete iter; +} + +#ifndef ROCKSDB_LITE +class BloomStatsTestWithParam + : public DBBloomFilterTest, + public testing::WithParamInterface<std::tuple<bool, bool>> { + public: + BloomStatsTestWithParam() { + use_block_table_ = std::get<0>(GetParam()); + use_block_based_builder_ = std::get<1>(GetParam()); + + options_.create_if_missing = true; + options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4)); + options_.memtable_prefix_bloom_size_ratio = + 8.0 * 1024.0 / static_cast<uint64_t>(options_.write_buffer_size); + if (use_block_table_) { + BlockBasedTableOptions table_options; + table_options.hash_index_allow_collision = false; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder_)); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } else { + PlainTableOptions table_options; + options_.table_factory.reset(NewPlainTableFactory(table_options)); + } + + perf_context.Reset(); + DestroyAndReopen(options_); + } + + ~BloomStatsTestWithParam() { + perf_context.Reset(); + Destroy(options_); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + bool use_block_table_; + bool use_block_based_builder_; + Options options_; +}; + +// 1 Insert 2 K-V pairs into DB +// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2 +// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1 +// 4 Call Flush() to create SST +// 5 Call Get() for both keys - expect SST bloom hit stat to be 2 +// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1 +// Test both: block and plain SST +TEST_P(BloomStatsTestWithParam, BloomStatsTest) { + std::string key1("AAAA"); + std::string key2("RXDB"); // not in DB + std::string key3("ZBRA"); + std::string value1("Value1"); + 
std::string value3("Value3"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key3, value3, WriteOptions())); + + // check memtable bloom stats + ASSERT_EQ(value1, Get(key1)); + ASSERT_EQ(1, perf_context.bloom_memtable_hit_count); + ASSERT_EQ(value3, Get(key3)); + ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); + ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); + + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(1, perf_context.bloom_memtable_miss_count); + ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); + + // sanity checks + ASSERT_EQ(0, perf_context.bloom_sst_hit_count); + ASSERT_EQ(0, perf_context.bloom_sst_miss_count); + + Flush(); + + // sanity checks + ASSERT_EQ(0, perf_context.bloom_sst_hit_count); + ASSERT_EQ(0, perf_context.bloom_sst_miss_count); + + // check SST bloom stats + ASSERT_EQ(value1, Get(key1)); + ASSERT_EQ(1, perf_context.bloom_sst_hit_count); + ASSERT_EQ(value3, Get(key3)); + ASSERT_EQ(2, perf_context.bloom_sst_hit_count); + + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(1, perf_context.bloom_sst_miss_count); +} + +// Same scenario as in BloomStatsTest but using an iterator +TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { + std::string key1("AAAA"); + std::string key2("RXDB"); // not in DB + std::string key3("ZBRA"); + std::string value1("Value1"); + std::string value3("Value3"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key3, value3, WriteOptions())); + + unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + + // check memtable bloom stats + iter->Seek(key1); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value1, iter->value().ToString()); + ASSERT_EQ(1, perf_context.bloom_memtable_hit_count); + ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); + + iter->Seek(key3); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value3, iter->value().ToString()); + ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); + ASSERT_EQ(0, 
perf_context.bloom_memtable_miss_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, perf_context.bloom_memtable_miss_count); + ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); + + Flush(); + + iter.reset(dbfull()->NewIterator(ReadOptions())); + + // Check SST bloom stats + iter->Seek(key1); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value1, iter->value().ToString()); + ASSERT_EQ(1, perf_context.bloom_sst_hit_count); + + iter->Seek(key3); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value3, iter->value().ToString()); + ASSERT_EQ(2, perf_context.bloom_sst_hit_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, perf_context.bloom_sst_miss_count); + ASSERT_EQ(2, perf_context.bloom_sst_hit_count); +} + +INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, + ::testing::Values(std::make_tuple(true, true), + std::make_tuple(true, false), + std::make_tuple(false, false))); + +namespace { +void PrefixScanInit(DBBloomFilterTest* dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } +} +} // namespace + +TEST_F(DBBloomFilterTest, PrefixScan) { + XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, + kSkipNoPrefix); + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + ASSERT_EQ(key.difference_offset(prefix), 8); + ASSERT_EQ(prefix.difference_offset(key), 8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + 
table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // 11 RAND I/Os + DestroyAndReopen(options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (!iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while + XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0); +} + +TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; + options.arena_block_size = 4 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 256 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.compression = kNoCompression; + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, true)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.optimize_filters_for_hits = true; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"mypikachu"}, options); + + int numkeys = 200000; + + // Generate randomly shuffled keys, so the updates are almost + // random. 
+ std::vector keys; + keys.reserve(numkeys); + for (int i = 0; i < numkeys; i += 2) { + keys.push_back(i); + } + std::random_shuffle(std::begin(keys), std::end(keys)); + + int num_inserted = 0; + for (int key : keys) { + ASSERT_OK(Put(1, Key(key), "val")); + if (++num_inserted % 1000 == 0) { + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + } + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + if (NumTableFilesAtLevel(0, 1) == 0) { + // No Level 0 file. Create one. + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + } + + for (int i = 1; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND"); + } + + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + // Now we have three sorted run, L0, L5 and L6 with most files in L6 have + // no bloom filter. Most keys be checked bloom filters twice. 
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); + ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); + + for (int i = 0; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + // Part 2 (read path): rewrite last level with blooms, then verify they get + // cached only if !optimize_filters_for_hits + options.disable_auto_compactions = true; + options.num_levels = 9; + options.optimize_filters_for_hits = false; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + MoveFilesToLevel(7 /* level */, 1 /* column family index */); + + std::string value = Get(1, Key(0)); + uint64_t prev_cache_filter_hits = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + value = Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_hits + 1, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Now that we know the filter blocks exist in the last level files, see if + // filter caching is skipped for this optimization + options.optimize_filters_for_hits = true; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + value = Get(1, Key(0)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // Check filter block ignored for files preloaded during DB::Open() + options.max_open_files = -1; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + uint64_t prev_cache_filter_misses = + 
TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Check filter block ignored for file trivially-moved to bottom level + bbto.block_cache.reset(); + options.max_open_files = 100; // setting > -1 makes it not preload all files + options.statistics = CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + ASSERT_OK(Put(1, Key(numkeys + 1), "val")); + ASSERT_OK(Flush(1)); + + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* arg) { trivial_move++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* arg) { non_trivial_move++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kSkip; + compact_options.change_level = true; + compact_options.target_level = 7; + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + prev_cache_filter_misses = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + value = Get(1, Key(numkeys + 1)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + + // Check filter block not cached for iterator + bbto.block_cache.reset(); + options.statistics = 
CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + std::unique_ptr iter(db_->NewIterator(ReadOptions(), handles_[1])); + iter->SeekToFirst(); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); +} + +#endif // ROCKSDB_LITE + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_compaction_filter_test.cc b/external/rocksdb/db/db_compaction_filter_test.cc index a1587f283c..ff6945cf83 100644 --- a/external/rocksdb/db/db_compaction_filter_test.cc +++ b/external/rocksdb/db/db_compaction_filter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,8 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/db_test_util.h" namespace rocksdb { @@ -47,6 +47,24 @@ class DeleteFilter : public CompactionFilter { virtual const char* Name() const override { return "DeleteFilter"; } }; +class DeleteISFilter : public CompactionFilter { + public: + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, + bool* value_changed) const override { + cfilter_count++; + int i = std::stoi(key.ToString()); + if (i > 5 && i <= 105) { + return true; + } + return false; + } + + virtual bool IgnoreSnapshots() const override { return true; } + + virtual const char* Name() const override { return "DeleteFilter"; } +}; + class DelayFilter : public CompactionFilter { public: explicit DelayFilter(DBTestBase* d) : db_test(d) {} @@ -97,8 +115,11 @@ class ChangeFilter : public CompactionFilter { class KeepFilterFactory : public CompactionFilterFactory { public: - explicit KeepFilterFactory(bool check_context = false) - : check_context_(check_context) {} + explicit KeepFilterFactory(bool check_context = false, + bool check_context_cf_id = false) + : check_context_(check_context), + check_context_cf_id_(check_context_cf_id), + compaction_filter_created_(false) {} virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override { @@ -106,13 +127,22 @@ class KeepFilterFactory : public CompactionFilterFactory { EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); } + if (check_context_cf_id_) { + EXPECT_EQ(expect_cf_id_.load(), context.column_family_id); + } + compaction_filter_created_ = true; return std::unique_ptr(new KeepFilter()); } + bool compaction_filter_created() const { return compaction_filter_created_; } + virtual const char* Name() const override { return "KeepFilterFactory"; } bool check_context_; + bool check_context_cf_id_; std::atomic_bool 
expect_full_compaction_; std::atomic_bool expect_manual_compaction_; + std::atomic expect_cf_id_; + bool compaction_filter_created_; }; class DeleteFilterFactory : public CompactionFilterFactory { @@ -129,6 +159,21 @@ class DeleteFilterFactory : public CompactionFilterFactory { virtual const char* Name() const override { return "DeleteFilterFactory"; } }; +// Delete Filter Factory which ignores snapshots +class DeleteISFilterFactory : public CompactionFilterFactory { + public: + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (context.is_manual_compaction) { + return std::unique_ptr(new DeleteISFilter()); + } else { + return std::unique_ptr(nullptr); + } + } + + virtual const char* Name() const override { return "DeleteFilterFactory"; } +}; + class DelayFilterFactory : public CompactionFilterFactory { public: explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} @@ -174,6 +219,7 @@ class ChangeFilterFactory : public CompactionFilterFactory { virtual const char* Name() const override { return "ChangeFilterFactory"; } }; +#ifndef ROCKSDB_LITE TEST_F(DBTestCompactionFilter, CompactionFilter) { Options options = CurrentOptions(); options.max_open_files = -1; @@ -216,7 +262,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { Arena arena; { ScopedArenaIterator iter( - dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + dbfull()->NewInternalIterator(&arena, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { @@ -304,7 +350,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { count = 0; { ScopedArenaIterator iter( - dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + dbfull()->NewInternalIterator(&arena, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { @@ -322,11 +368,10 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { // entries are deleted. 
The compaction should create bunch of 'DeleteFile' // entries in VersionEdit, but none of the 'AddFile's. TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { - Options options; + Options options = CurrentOptions(); options.compaction_filter_factory = std::make_shared(); options.disable_auto_compactions = true; options.create_if_missing = true; - options = CurrentOptions(options); DestroyAndReopen(options); // put some data @@ -350,14 +395,14 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { delete itr; } +#endif // ROCKSDB_LITE TEST_F(DBTestCompactionFilter, CompactionFilterWithValueChange) { do { - Options options; + Options options = CurrentOptions(); options.num_levels = 3; options.compaction_filter_factory = std::make_shared(); - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Write 100K+1 keys, these are written to a few files @@ -420,8 +465,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { PutFixed64(&three, 3); PutFixed64(&four, 4); - Options options; - options = CurrentOptions(options); + Options options = CurrentOptions(); options.create_if_missing = true; options.merge_operator = MergeOperators::CreateUInt64AddOperator(); options.num_levels = 3; @@ -481,8 +525,9 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_EQ(newvalue, four); } +#ifndef ROCKSDB_LITE TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { - KeepFilterFactory* filter = new KeepFilterFactory(); + KeepFilterFactory* filter = new KeepFilterFactory(true, true); Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; @@ -504,22 +549,24 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { // be triggered. 
num_keys_per_file /= 2; } + dbfull()->TEST_WaitForCompact(); // Force a manual compaction cfilter_count = 0; filter->expect_manual_compaction_.store(true); - filter->expect_full_compaction_.store(false); // Manual compaction always - // set this flag. + filter->expect_full_compaction_.store(true); + filter->expect_cf_id_.store(0); dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); ASSERT_EQ(cfilter_count, 700); ASSERT_EQ(NumSortedRuns(0), 1); + ASSERT_TRUE(filter->compaction_filter_created()); // Verify total number of keys is correct after manual compaction. { int count = 0; int total = 0; Arena arena; - ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + ScopedArenaIterator iter(dbfull()->NewInternalIterator(&arena)); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { @@ -533,18 +580,48 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { iter->Next(); } ASSERT_EQ(total, 700); - ASSERT_EQ(count, 1); + ASSERT_EQ(count, 2); } } +#endif // ROCKSDB_LITE + +TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) { + KeepFilterFactory* filter = new KeepFilterFactory(false, true); + filter->expect_cf_id_.store(1); + + Options options = CurrentOptions(); + options.compaction_filter_factory.reset(filter); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 2; + CreateAndReopenWithCF({"pikachu"}, options); + int num_keys_per_file = 400; + for (int j = 0; j < 3; j++) { + // Write several keys. + const std::string value(10, 'x'); + for (int i = 0; i < num_keys_per_file; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%02d", i, j); + Put(1, key, value); + } + Flush(1); + // Make sure next file is much smaller so automatic compaction will not + // be triggered. 
+ num_keys_per_file /= 2; + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_TRUE(filter->compaction_filter_created()); +} + +#ifndef ROCKSDB_LITE // Compaction filters should only be applied to records that are newer than the // latest snapshot. This test inserts records and applies a delete filter. TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) { - Options options; + Options options = CurrentOptions(); options.compaction_filter_factory = std::make_shared(); options.disable_auto_compactions = true; options.create_if_missing = true; - options = CurrentOptions(options); DestroyAndReopen(options); // Put some data. @@ -573,14 +650,72 @@ TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) { ASSERT_EQ(0U, CountLiveFiles()); } +// Compaction filters should only be applied to records that are newer than the +// latest snapshot. However, if the compaction filter asks to ignore snapshots +// records newer than the snapshot will also be processed +TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { + std::string five = ToString(5); + Options options = CurrentOptions(); + options.compaction_filter_factory = std::make_shared(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Put some data. + const Snapshot* snapshot = nullptr; + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); + + if (table == 0) { + snapshot = db_->GetSnapshot(); + } + } + assert(snapshot != nullptr); + + cfilter_count = 0; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // The filter should delete 40 records. 
+ ASSERT_EQ(40U, cfilter_count); + + { + // Scan the entire database as of the snapshot to ensure + // that nothing is left + ReadOptions read_options; + read_options.snapshot = snapshot; + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + int count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, 6); + read_options.snapshot = 0; + std::unique_ptr iter1(db_->NewIterator(read_options)); + iter1->SeekToFirst(); + count = 0; + while (iter1->Valid()) { + count++; + iter1->Next(); + } + // We have deleted 10 keys from 40 using the compaction filter + // Keys 6-9 before the snapshot and 100-105 after the snapshot + ASSERT_EQ(count, 30); + } + + // Release the snapshot and compact again -> now all records should be + // removed. + db_->ReleaseSnapshot(snapshot); +} +#endif // ROCKSDB_LITE + } // namespace rocksdb int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/external/rocksdb/db/db_compaction_test.cc b/external/rocksdb/db/db_compaction_test.cc index e052fc7e11..6f7d78c061 100644 --- a/external/rocksdb/db/db_compaction_test.cc +++ b/external/rocksdb/db/db_compaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,26 +7,28 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/experimental.h" -#include "util/db_test_util.h" +#include "rocksdb/utilities/convenience.h" #include "util/sync_point.h" namespace rocksdb { // SYNC_POINT is not supported in released Windows mode. -#if !(defined NDEBUG) || !defined(OS_WIN) - +#if !defined(ROCKSDB_LITE) class DBCompactionTest : public DBTestBase { public: DBCompactionTest() : DBTestBase("/db_compaction_test") {} }; -class DBCompactionTestWithParam : public DBTestBase, - public testing::WithParamInterface { +class DBCompactionTestWithParam + : public DBTestBase, + public testing::WithParamInterface> { public: DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") { - max_subcompactions_ = GetParam(); + max_subcompactions_ = std::get<0>(GetParam()); + exclusive_manual_compaction_ = std::get<1>(GetParam()); } // Required if inheriting from testing::WithParamInterface<> @@ -34,43 +36,41 @@ class DBCompactionTestWithParam : public DBTestBase, static void TearDownTestCase() {} uint32_t max_subcompactions_; + bool exclusive_manual_compaction_; }; namespace { -class OnFileDeletionListener : public EventListener { - public: - OnFileDeletionListener() : - matched_count_(0), - expected_file_name_("") {} - void SetExpectedFileName( - const std::string file_name) { - expected_file_name_ = file_name; - } +class FlushedFileCollector : public EventListener { + public: + FlushedFileCollector() {} + ~FlushedFileCollector() {} - void VerifyMatchedCount(size_t expected_value) { - ASSERT_EQ(matched_count_, expected_value); + virtual void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + std::lock_guard lock(mutex_); + flushed_files_.push_back(info.file_path); } - void OnTableFileDeleted( - const TableFileDeletionInfo& info) override { - if (expected_file_name_ != "") { - ASSERT_EQ(expected_file_name_, info.file_path); - expected_file_name_ = ""; - matched_count_++; + std::vector GetFlushedFiles() { + std::lock_guard 
lock(mutex_); + std::vector result; + for (auto fname : flushed_files_) { + result.push_back(fname); } + return result; } + void ClearFlushedFiles() { flushed_files_.clear(); } + private: - size_t matched_count_; - std::string expected_file_name_; + std::vector flushed_files_; + std::mutex mutex_; }; static const int kCDTValueSize = 1000; static const int kCDTKeysPerBuffer = 4; static const int kCDTNumLevels = 8; -Options DeletionTriggerOptions() { - Options options; +Options DeletionTriggerOptions(Options options) { options.compression = kNoCompression; options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24); options.min_write_buffer_number_to_merge = 1; @@ -161,7 +161,7 @@ const SstFileMetaData* PickFileRandomly( auto result = rand->Uniform(file_id); return &(level_meta.files[result]); } - file_id -= level_meta.files.size(); + file_id -= static_cast(level_meta.files.size()); } assert(false); return nullptr; @@ -173,7 +173,7 @@ const SstFileMetaData* PickFileRandomly( TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { for (int tid = 0; tid < 3; ++tid) { uint64_t db_size[2]; - Options options = CurrentOptions(DeletionTriggerOptions()); + Options options = DeletionTriggerOptions(CurrentOptions()); options.max_subcompactions = max_subcompactions_; if (tid == 1) { @@ -212,12 +212,11 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { } TEST_F(DBCompactionTest, SkipStatsUpdateTest) { - // This test verify UpdateAccumulatedStats is not on by observing - // the compaction behavior when there are many of deletion entries. + // This test verify UpdateAccumulatedStats is not on + // if options.skip_stats_update_on_db_open = true // The test will need to be updated if the internal behavior changes. 
- Options options = DeletionTriggerOptions(); - options = CurrentOptions(options); + Options options = DeletionTriggerOptions(CurrentOptions()); options.env = env_; DestroyAndReopen(options); Random rnd(301); @@ -231,18 +230,17 @@ TEST_F(DBCompactionTest, SkipStatsUpdateTest) { dbfull()->TEST_WaitForFlushMemTable(); dbfull()->TEST_WaitForCompact(); - for (int k = 0; k < kTestSize; ++k) { - ASSERT_OK(Delete(Key(k))); - } - // Reopen the DB with stats-update disabled options.skip_stats_update_on_db_open = true; env_->random_file_open_counter_.store(0); Reopen(options); - // As stats-update is disabled, we expect a very low - // number of random file open. - ASSERT_LT(env_->random_file_open_counter_.load(), 5); + // As stats-update is disabled, we expect a very low number of + // random file open. + // Note that this number must be changed accordingly if we change + // the number of files needed to be opened in the DB::Open process. + const int kMaxFileOpenCount = 10; + ASSERT_LT(env_->random_file_open_counter_.load(), kMaxFileOpenCount); // Repeat the reopen process, but this time we enable // stats-update. @@ -252,12 +250,11 @@ TEST_F(DBCompactionTest, SkipStatsUpdateTest) { // Since we do a normal stats update on db-open, there // will be more random open files. 
- ASSERT_GT(env_->random_file_open_counter_.load(), 5); + ASSERT_GT(env_->random_file_open_counter_.load(), kMaxFileOpenCount); } TEST_F(DBCompactionTest, TestTableReaderForCompaction) { - Options options; - options = CurrentOptions(options); + Options options = CurrentOptions(); options.env = env_; options.new_table_reader_for_compaction_inputs = true; options.max_open_files = 100; @@ -346,7 +343,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { for (int tid = 0; tid < 2; ++tid) { uint64_t db_size[3]; - Options options = CurrentOptions(DeletionTriggerOptions()); + Options options = DeletionTriggerOptions(CurrentOptions()); options.max_subcompactions = max_subcompactions_; if (tid == 1) { @@ -403,7 +400,7 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { uint64_t db_size[3]; for (int test = 0; test < 2; ++test) { - Options options = CurrentOptions(DeletionTriggerOptions()); + Options options = DeletionTriggerOptions(CurrentOptions()); options.skip_stats_update_on_db_open = (test == 0); env_->random_read_counter_.Reset(); @@ -459,13 +456,15 @@ TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { TEST_P(DBCompactionTestWithParam, CompactionTrigger) { - Options options; + const int kNumKeysPerFile = 100; + + Options options = CurrentOptions(); options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; options.num_levels = 3; options.level0_file_num_compaction_trigger = 3; options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -474,31 +473,131 @@ TEST_P(DBCompactionTestWithParam, CompactionTrigger) { num++) { std::vector values; // Write 100KB (100 values, each 1K) - for (int i = 0; i < 100; i++) { + for (int i = 0; i 
< kNumKeysPerFile; i++) { values.push_back(RandomString(&rnd, 990)); ASSERT_OK(Put(1, Key(i), values[i])); } + // put extra key to trigger flush + ASSERT_OK(Put(1, "", "")); dbfull()->TEST_WaitForFlushMemTable(handles_[1]); ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } // generate one more file in level-0, and should trigger level-0 compaction std::vector values; - for (int i = 0; i < 100; i++) { + for (int i = 0; i < kNumKeysPerFile; i++) { values.push_back(RandomString(&rnd, 990)); ASSERT_OK(Put(1, Key(i), values[i])); } + // put extra key to trigger flush + ASSERT_OK(Put(1, "", "")); dbfull()->TEST_WaitForCompact(); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); } +TEST_F(DBCompactionTest, BGCompactionsAllowed) { + // Create several column families. Make compaction triggers in all of them + // and see number of compactions scheduled to be less than allowed. + const int kNumKeysPerFile = 100; + + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 3; + // Should speed up compaction when there are 4 files. + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 20; + options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large + options.base_background_compactions = 1; + options.max_background_compactions = 3; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + + // Block all threads in thread pool. 
+ const size_t kTotalTasks = 4; + env_->SetBackgroundThreads(4, Env::LOW); + test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; + for (size_t i = 0; i < kTotalTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + + CreateAndReopenWithCF({"one", "two", "three"}, options); + + Random rnd(301); + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. + ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Create two more files for one column family, which triggers speed up + // condition, three compactions will be scheduled. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(2, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(2, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 2)); + } + ASSERT_EQ(3, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Unblock all threads to unblock all compactions. + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } + dbfull()->TEST_WaitForCompact(); + + // Verify number of compactions allowed will come back to 1. 
+ + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. + ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } +} + TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000000; // Large write buffer options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -525,9 +624,8 @@ TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { TEST_F(DBCompactionTest, MinorCompactionsHappen) { do { - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 10000; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); const int N = 500; @@ -551,14 +649,132 @@ TEST_F(DBCompactionTest, MinorCompactionsHappen) { } while (ChangeCompactOptions()); } +TEST_F(DBCompactionTest, UserKeyCrossFile1) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + DestroyAndReopen(options); + + // create first file and flush to l0 + Put("4", "A"); + 
Put("3", "A"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + Put("2", "A"); + Delete("3"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("3")); + + // move both files down to l1 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ("NOT_FOUND", Get("3")); + + for (int i = 0; i < 3; i++) { + Put("2", "B"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("NOT_FOUND", Get("3")); +} + +TEST_F(DBCompactionTest, UserKeyCrossFile2) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + DestroyAndReopen(options); + + // create first file and flush to l0 + Put("4", "A"); + Put("3", "A"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + Put("2", "A"); + SingleDelete("3"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("3")); + + // move both files down to l1 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ("NOT_FOUND", Get("3")); + + for (int i = 0; i < 3; i++) { + Put("2", "B"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("NOT_FOUND", Get("3")); +} + +TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + // compaction options + CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + compact_opt.output_file_size_limit = 4096; + const size_t key_len = + static_cast(compact_opt.output_file_size_limit) / 5; + + DestroyAndReopen(options); + + std::vector snaps; + + // create first file and flush to l0 + for (auto& key : {"1", "2", "3", "3", "3", "3"}) { + Put(key, 
std::string(key_len, 'A')); + snaps.push_back(dbfull()->GetSnapshot()); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + // create second file and flush to l0 + for (auto& key : {"3", "4", "5", "6", "7", "8"}) { + Put(key, std::string(key_len, 'A')); + snaps.push_back(dbfull()->GetSnapshot()); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + // move both files down to l1 + dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1); + + // release snap so that first instance of key(3) can have seqId=0 + for (auto snap : snaps) { + dbfull()->ReleaseSnapshot(snap); + } + + // create 3 files in l0 so to trigger compaction + for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { + Put("2", std::string(1, 'A')); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + } + + dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put("", "")); +} + // Check that writes done during a memtable compaction are recovered // if the database is shutdown during the memtable compaction. 
TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) { do { - Options options; + Options options = CurrentOptions(); options.env = env_; - options.write_buffer_size = 1000000; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Trigger a long memtable compaction and reopen the database during it @@ -582,10 +798,9 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { [&](void* arg) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000000; options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); DestroyAndReopen(options); int32_t num_keys = 80; @@ -608,8 +823,11 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { ASSERT_EQ(metadata.size(), 1U); LiveFileMetaData level0_file = metadata[0]; // L0 file meta + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + // Compaction will initiate a trivial move from L0 to L1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + dbfull()->CompactRange(cro, nullptr, nullptr); // File moved From L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0 @@ -661,7 +879,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { Random rnd(301); std::map values; - for (uint32_t i = 0; i < ranges.size(); i++) { + for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { values[j] = RandomString(&rnd, value_size); ASSERT_OK(Put(Key(j), values[j])); @@ -673,14 +891,17 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0 ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + // Since data is non-overlapping we expect compaction to initiate // 
a trivial move - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + db_->CompactRange(cro, nullptr, nullptr); // We expect that all the files were trivially moved from L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files); - for (uint32_t i = 0; i < ranges.size(); i++) { + for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { ASSERT_EQ(Get(Key(j)), values[j]); } @@ -704,7 +925,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { {500, 560}, // this range overlap with the next one {551, 599}, }; - for (uint32_t i = 0; i < ranges.size(); i++) { + for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { values[j] = RandomString(&rnd, value_size); ASSERT_OK(Put(Key(j), values[j])); @@ -712,9 +933,9 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { ASSERT_OK(Flush()); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + db_->CompactRange(cro, nullptr, nullptr); - for (uint32_t i = 0; i < ranges.size(); i++) { + for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { ASSERT_EQ(Get(Key(j)), values[j]); } @@ -768,6 +989,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 6; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); // 2 files in L6 ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0)); @@ -783,6 +1005,391 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) { } } +TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + 
"DBImpl::BackgroundCompaction:TrivialMove", + [&](void* arg) { trivial_move++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* arg) { non_trivial_move++; }); + bool first = true; + // Purpose of dependencies: + // 4 -> 1: ensure the order of two non-trivial compactions + // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions + // are installed + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"}, + {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"}, + {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (first) { + first = false; + TEST_SYNC_POINT("DBCompaction::ManualPartial:4"); + TEST_SYNC_POINT("DBCompaction::ManualPartial:3"); + } else { // second non-trivial compaction + TEST_SYNC_POINT("DBCompaction::ManualPartial:2"); + } + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.num_levels = 7; + options.max_subcompactions = max_subcompactions_; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + options.target_file_size_base = 1 << 23; // 8 MB + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + 
CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 6; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + // Trivial move the two non-overlapping files to level 6 + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L6 + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 1 files in L0 + ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false)); + // 2 files in L6, 1 file in L5 + ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 6); + ASSERT_EQ(non_trivial_move, 0); + + std::thread threads([&] { + compact_options.change_level = false; + compact_options.exclusive_manual_compaction = false; + std::string begin_string = Key(0); + std::string end_string = Key(199); + Slice begin(begin_string); + Slice end(end_string); + // First non-trivial compaction is triggered + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + }); + + TEST_SYNC_POINT("DBCompaction::ManualPartial:1"); + // file 4 [300 => 400) + for (int32_t i = 300; i <= 400; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 5 [400 => 500) + for (int32_t i = 400; i <= 500; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + 
ASSERT_OK(Flush()); + + // file 6 [500 => 600) + for (int32_t i = 500; i <= 600; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + // Second non-trivial compaction is triggered + ASSERT_OK(Flush()); + + // Before two non-trivial compactions are installed, there are 3 files in L0 + ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0)); + TEST_SYNC_POINT("DBCompaction::ManualPartial:5"); + + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + // After two non-trivial compactions are installed, there is 1 file in L6, and + // 1 file in L1 + ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0)); + threads.join(); + + for (int32_t i = 0; i < 600; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST_F(DBCompactionTest, ManualPartialFill) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* arg) { trivial_move++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* arg) { non_trivial_move++; }); + bool first = true; + bool second = true; + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"}, + {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + if (first) { + TEST_SYNC_POINT("DBCompaction::PartialFill:4"); + first = false; + TEST_SYNC_POINT("DBCompaction::PartialFill:3"); + } else if (second) { + } + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + + DestroyAndReopen(options); + 
int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L2 + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L2, 1 in L0 + ASSERT_EQ("1,0,2", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false)); + // 2 files in L2, 1 in L1 + ASSERT_EQ("0,1,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 2); + ASSERT_EQ(non_trivial_move, 0); + + std::thread threads([&] { + compact_options.change_level = false; + compact_options.exclusive_manual_compaction = false; + std::string begin_string = Key(0); + std::string end_string = Key(199); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + }); + + TEST_SYNC_POINT("DBCompaction::PartialFill:1"); + // Many files 4 [300 => 4300) + for (int32_t i = 0; i <= 5; i++) { + for (int32_t j = 300; j < 4300; j++) { + if (j == 2300) { + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + } + values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + } + + // Verify level sizes + uint64_t target_size = 4 * 
options.max_bytes_for_level_base; + for (int32_t i = 1; i < options.num_levels; i++) { + ASSERT_LE(SizeAtLevel(i), target_size); + target_size *= options.max_bytes_for_level_multiplier; + } + + TEST_SYNC_POINT("DBCompaction::PartialFill:2"); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + threads.join(); + + for (int32_t i = 0; i < 4300; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST_F(DBCompactionTest, DeleteFileRange) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L2 + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // Many files 4 [300 => 4300) + for (int32_t i = 0; i <= 5; i++) { + for (int32_t j = 300; j < 4300; j++) { + if (j == 2300) { + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + } + values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + } + ASSERT_OK(Flush()); + 
dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + + // Verify level sizes + uint64_t target_size = 4 * options.max_bytes_for_level_base; + for (int32_t i = 1; i < options.num_levels; i++) { + ASSERT_LE(SizeAtLevel(i), target_size); + target_size *= options.max_bytes_for_level_multiplier; + } + + size_t old_num_files = CountFiles(); + std::string begin_string = Key(1000); + std::string end_string = Key(2000); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + + int32_t deleted_count = 0; + for (int32_t i = 0; i < 4300; i++) { + if (i < 1000 || i > 2000) { + ASSERT_EQ(Get(Key(i)), values[i]); + } else { + ReadOptions roptions; + std::string result; + Status s = db_->Get(roptions, Key(i), &result); + ASSERT_TRUE(s.IsNotFound() || s.ok()); + if (s.IsNotFound()) { + deleted_count++; + } + } + } + ASSERT_GT(deleted_count, 0); + begin_string = Key(5000); + end_string = Key(6000); + Slice begin1(begin_string); + Slice end1(end_string); + // Try deleting files in range which contain no keys + ASSERT_OK( + DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1)); + + // Push data from level 0 to level 1 to force all data to be deleted + // Note that we don't delete level 0 files + compact_options.change_level = true; + compact_options.target_level = 1; + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + + ASSERT_OK( + DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); + + int32_t deleted_count2 = 0; + for (int32_t i = 0; i < 4300; i++) { + ReadOptions roptions; + std::string result; + Status s = db_->Get(roptions, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + deleted_count2++; + } + ASSERT_GT(deleted_count2, deleted_count); + size_t new_num_files = CountFiles(); + ASSERT_GT(old_num_files, new_num_files); +} + TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { int32_t trivial_move = 0; int32_t 
non_trivial_move = 0; @@ -794,10 +1401,9 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { [&](void* arg) { non_trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000000; options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); DestroyAndReopen(options); int32_t value_size = 10 * 1024; // 10 KB @@ -816,6 +1422,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 3; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); ASSERT_EQ(trivial_move, 1); @@ -829,8 +1436,10 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { ASSERT_OK(Flush()); ASSERT_EQ("1,0,0,1", FilesPerLevel(0)); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves) - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,0,0,2", FilesPerLevel(0)); ASSERT_EQ(trivial_move, 4); ASSERT_EQ(non_trivial_move, 0); @@ -847,6 +1456,8 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) { options.db_paths.emplace_back(dbname_, 500 * 1024); options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -962,6 +1573,8 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) { 
options.db_paths.emplace_back(dbname_, 500 * 1024); options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -1079,7 +1692,7 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { int max_key_universal_insert = 600; // Stage 1: generate a db with level compaction - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; options.num_levels = 4; @@ -1089,7 +1702,6 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { options.target_file_size_base = 200 << 10; // 200KB options.target_file_size_multiplier = 1; options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i <= max_key_level_insert; i++) { @@ -1130,6 +1742,7 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { compact_options.target_level = 0; compact_options.bottommost_level_compaction = BottommostLevelCompaction::kForce; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); // Only 1 file in L0 @@ -1228,6 +1841,7 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { TEST_P(DBCompactionTestWithParam, ManualCompaction) { Options options = CurrentOptions(); options.max_subcompactions = max_subcompactions_; + options.statistics = rocksdb::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); // iter - 0 with 7 levels @@ -1259,7 +1873,16 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) { // Compact all MakeTables(1, "a", "z", 1); ASSERT_EQ("1,0,2", FilesPerLevel(1)); - 
db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + + uint64_t prev_block_cache_add = + options.statistics->getTickerCount(BLOCK_CACHE_ADD); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + db_->CompactRange(cro, handles_[1], nullptr, nullptr); + // Verify manual compaction doesn't fill block cache + ASSERT_EQ(prev_block_cache_add, + options.statistics->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); if (iter == 0) { @@ -1267,6 +1890,7 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) { options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); } @@ -1333,6 +1957,7 @@ TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); CompactRangeOptions compact_options; compact_options.target_path_id = 1; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); ASSERT_EQ("0,1", FilesPerLevel(1)); @@ -1401,7 +2026,7 @@ TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) { std::set overlapping_file_names; std::vector compaction_input_file_names; for (int f = 0; f < file_picked; ++f) { - int level; + int level = 0; auto file_meta = PickFileRandomly(cf_meta, &rnd, &level); compaction_input_file_names.push_back(file_meta->name); GetOverlappingFileNumbersForLevelCompaction( @@ -1593,6 +2218,8 @@ TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) { return; } Options options = CurrentOptions(); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -1699,6 +2326,25 
@@ TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) { Destroy(options); } +TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) { + Options options = CurrentOptions(); + options.max_background_compactions = 5; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_EQ(5, db_->GetOptions().base_background_compactions); + ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit); + + options.base_background_compactions = 4; + options.max_background_compactions = 3; + options.soft_pending_compaction_bytes_limit = 200; + options.hard_pending_compaction_bytes_limit = 150; + DestroyAndReopen(options); + ASSERT_EQ(3, db_->GetOptions().base_background_compactions); + ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit); +} + // This tests for a bug that could cause two level0 compactions running // concurrently // TODO(aekmekji): Make sure that the reason this fails when run with @@ -1770,10 +2416,9 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { [&](void* arg) { non_trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000000; options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); DestroyAndReopen(options); int32_t value_size = 10 * 1024; // 10 KB @@ -1843,12 +2488,66 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { } INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, - ::testing::Values(1, 4)); -#endif // !(defined NDEBUG) || !defined(OS_WIN) + ::testing::Values(std::make_tuple(1, true), + std::make_tuple(1, false), + std::make_tuple(4, true), + std::make_tuple(4, false))); + +class CompactionPriTest : public DBTestBase, + public testing::WithParamInterface { + public: + CompactionPriTest() : 
DBTestBase("/compaction_pri_test") { + compaction_pri_ = GetParam(); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + uint32_t compaction_pri_; +}; + +TEST_P(CompactionPriTest, Test) { + Options options = CurrentOptions(); + options.write_buffer_size = 16 * 1024; + options.compaction_pri = static_cast(compaction_pri_); + options.hard_pending_compaction_bytes_limit = 256 * 1024; + options.max_bytes_for_level_base = 64 * 1024; + options.max_bytes_for_level_multiplier = 4; + options.compression = kNoCompression; + + DestroyAndReopen(options); + + Random rnd(301); + const int kNKeys = 5000; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + std::random_shuffle(std::begin(keys), std::end(keys)); + + for (int i = 0; i < kNKeys; i++) { + ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 102))); + } + + dbfull()->TEST_WaitForCompact(); + for (int i = 0; i < kNKeys; i++) { + ASSERT_NE("NOT_FOUND", Get(Key(i))); + } +} + +INSTANTIATE_TEST_CASE_P( + CompactionPriTest, CompactionPriTest, + ::testing::Values(CompactionPri::kByCompensatedSize, + CompactionPri::kOldestLargestSeqFirst, + CompactionPri::kOldestSmallestSeqFirst, + CompactionPri::kMinOverlappingRatio)); + +#endif // !defined(ROCKSDB_LITE) } // namespace rocksdb int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/external/rocksdb/db/db_dynamic_level_test.cc b/external/rocksdb/db/db_dynamic_level_test.cc index f4d2b81999..6783c58d00 100644 --- a/external/rocksdb/db/db_dynamic_level_test.cc +++ b/external/rocksdb/db/db_dynamic_level_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,10 +10,10 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. // which is a pity, it is a good test -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) +#include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/db_test_util.h" namespace rocksdb { class DBTestDynamicLevel : public DBTestBase { @@ -125,15 +125,15 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { Options options = CurrentOptions(); options.create_if_missing = true; - options.db_write_buffer_size = 2048; - options.write_buffer_size = 2048; + options.db_write_buffer_size = 204800; + options.write_buffer_size = 20480; options.max_write_buffer_number = 2; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 9999; options.level0_stop_writes_trigger = 9999; - options.target_file_size_base = 2048; + options.target_file_size_base = 9102; options.level_compaction_dynamic_level_bytes = true; - options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_base = 40960; options.max_bytes_for_level_multiplier = 4; options.max_background_compactions = 2; options.num_levels = 5; @@ -154,10 +154,10 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(4U, int_prop); - // Put about 7K to L0 + // Put about 28K to L0 for (int i = 0; i < 70; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); + RandomString(&rnd, 380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, @@ -167,14 +167,14 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", 
&int_prop)); ASSERT_EQ(4U, int_prop); - // Insert extra about 3.5K to L0. After they are compacted to L4, base level + // Insert extra about 28K to L0. After they are compacted to L4, base level // should be changed to L3. ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"}, })); for (int i = 0; i < 70; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); + RandomString(&rnd, 380))); } ASSERT_OK(dbfull()->SetOptions({ @@ -199,10 +199,10 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"}, })); - // Write about 10K more + // Write about 40K more for (int i = 0; i < 100; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); + RandomString(&rnd, 380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, @@ -218,15 +218,15 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { // Trigger a condition that the compaction changes base level and L0->Lbase // happens at the same time. - // We try to make last levels' targets to be 10K, 40K, 160K, add triggers + // We try to make last levels' targets to be 40K, 160K, 640K, add triggers // another compaction from 40K->160K. ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"}, })); - // Write about 150K more - for (int i = 0; i < 1350; i++) { + // Write about 600K more + for (int i = 0; i < 1500; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); + RandomString(&rnd, 380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, @@ -236,30 +236,40 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(2U, int_prop); + // A manual compaction will trigger the base level to become L2 // Keep Writing data until base level changed 2->1. 
There will be L0->L2 // compaction going on at the same time. + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"}, + {"DynamicLevelMaxBytesBase2:1", "CompactionJob::Run():End"}, + {"DynamicLevelMaxBytesBase2:compact_range_finish", + "FlushJob::WriteLevel0Table"}, + }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - for (int attempt = 0; attempt <= 20; attempt++) { - // Write about 5K more data with two flushes. It should be flush to level 2 - // but when it is applied, base level is already 1. - for (int i = 0; i < 50; i++) { - ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); - } - Flush(); - ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); - if (int_prop == 2U) { - env_->SleepForMicroseconds(50000); - } else { - break; - } + std::thread thread([this] { + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish"); + }); + + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); + for (int i = 0; i < 2; i++) { + ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), + RandomString(&rnd, 380))); } + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1"); + + Flush(); + + thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - env_->SleepForMicroseconds(200000); - ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(1U, int_prop); } @@ -484,10 +494,10 @@ TEST_F(DBTestDynamicLevel, MigrateToDynamicLevelMaxBytesBase) { } } // namespace rocksdb -#endif // !(defined NDEBUG) || !defined(OS_WIN) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) 
+#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/external/rocksdb/db/db_filesnapshot.cc b/external/rocksdb/db/db_filesnapshot.cc index e39ccf4964..7ed1f1d363 100644 --- a/external/rocksdb/db/db_filesnapshot.cc +++ b/external/rocksdb/db/db_filesnapshot.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,19 +14,20 @@ #endif #include +#include #include #include -#include #include "db/db_impl.h" #include "db/filename.h" #include "db/job_context.h" #include "db/version_set.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "port/port.h" +#include "util/file_util.h" #include "util/mutexlock.h" #include "util/sync_point.h" -#include "util/file_util.h" +#include "util/testharness.h" namespace rocksdb { @@ -83,7 +84,6 @@ int DBImpl::IsFileDeletionsEnabled() const { Status DBImpl::GetLiveFiles(std::vector& ret, uint64_t* manifest_file_size, bool flush_memtable) { - *manifest_file_size = 0; mutex_.Lock(); @@ -126,7 +126,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, } ret.clear(); - ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST + ret.reserve(live.size() + 3); // *.sst + CURRENT + MANIFEST + OPTIONS // create names of the live files. 
The names are not absolute // paths, instead they are relative to dbname_; @@ -136,6 +136,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, ret.push_back(CurrentFileName("")); ret.push_back(DescriptorFileName("", versions_->manifest_file_number())); + ret.push_back(OptionsFileName("", versions_->options_file_number())); // find length of manifest file while holding the mutex lock *manifest_file_size = versions_->manifest_file_size(); diff --git a/external/rocksdb/db/db_flush_test.cc b/external/rocksdb/db/db_flush_test.cc new file mode 100644 index 0000000000..ab4b1ab4c3 --- /dev/null +++ b/external/rocksdb/db/db_flush_test.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "util/sync_point.h" + +namespace rocksdb { + +class DBFlushTest : public DBTestBase { + public: + DBFlushTest() : DBTestBase("/db_flush_test") {} +}; + +// We had issue when two background threads trying to flush at the same time, +// only one of them get committed. The test verifies the issue is fixed. 
+TEST_F(DBFlushTest, FlushWhileWritingManifest) { + Options options; + options.disable_auto_compactions = true; + options.max_background_flushes = 2; + Reopen(options); + FlushOptions no_wait; + no_wait.wait = false; + + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply:WriteManifest", + "DBFlushTest::FlushWhileWritingManifest:1"}, + {"MemTableList::InstallMemtableFlushResults:InProgress", + "VersionSet::LogAndApply:WriteManifestDone"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("foo", "v")); + ASSERT_OK(dbfull()->Flush(no_wait)); + TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1"); + ASSERT_OK(Put("bar", "v")); + ASSERT_OK(dbfull()->Flush(no_wait)); + // If the issue is hit we will wait here forever. + dbfull()->TEST_WaitForFlushMemTable(); +#ifndef ROCKSDB_LITE + ASSERT_EQ(2, TotalTableFiles()); +#endif // ROCKSDB_LITE +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_impl.cc b/external/rocksdb/db/db_impl.cc index cf4fa74b35..f750ef8390 100644 --- a/external/rocksdb/db/db_impl.cc +++ b/external/rocksdb/db/db_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,19 +6,24 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
- #include "db/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif - #include #include +#ifdef OS_SOLARIS +#include +#endif +#ifdef ROCKSDB_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif #include #include #include +#include #include #include #include @@ -27,8 +32,10 @@ #include #include +#include "db/auto_roll_logger.h" #include "db/builder.h" #include "db/compaction_job.h" +#include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/event_helpers.h" @@ -49,41 +56,42 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" -#include "db/writebuffer.h" +#include "db/xfunc_test_points.h" +#include "memtable/hash_linklist_rep.h" +#include "memtable/hash_skiplist_rep.h" #include "port/likely.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" -#include "rocksdb/delete_scheduler.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/sst_file_writer.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/version.h" +#include "rocksdb/wal_filter.h" +#include "rocksdb/write_buffer_manager.h" #include "table/block.h" #include "table/block_based_table_factory.h" #include "table/merger.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" -#include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/db_info_dumper.h" #include "util/file_reader_writer.h" #include "util/file_util.h" -#include "util/hash_linklist_rep.h" -#include "util/hash_skiplist_rep.h" #include "util/iostats_context_imp.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" +#include "util/options_helper.h" +#include "util/options_parser.h" #include "util/perf_context_imp.h" +#include 
"util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" @@ -132,13 +140,22 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } if (result.info_log == nullptr) { - Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env, - result, &result.info_log); + Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); if (!s.ok()) { // No place suitable for logging result.info_log = nullptr; } } + if (!result.write_buffer_manager) { + result.write_buffer_manager.reset( + new WriteBufferManager(result.db_write_buffer_size)); + } + if (result.base_background_compactions == -1) { + result.base_background_compactions = result.max_background_compactions; + } + if (result.base_background_compactions > result.max_background_compactions) { + result.base_background_compactions = result.max_background_compactions; + } result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions, Env::Priority::LOW); result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes, @@ -150,6 +167,21 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } } + if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) { + result.recycle_log_file_num = false; + } + + if (result.recycle_log_file_num && + (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || + result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { + // kPointInTimeRecovery is indistinguishable from + // kTolerateCorruptedTailRecords in recycle mode since we define + // the "end" of the log as the first corrupt record we encounter. + // kAbsoluteConsistency doesn't make sense because even a clean + // shutdown leaves old junk at the end of the log file. 
+ result.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; + } + if (result.wal_dir.empty()) { // Use dbname as default result.wal_dir = dbname; @@ -166,6 +198,13 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.new_table_reader_for_compaction_inputs = true; } + // Force flush on DB open if 2PC is enabled, since with 2PC we have no + // guarantee that consecutive log files have consecutive sequence id, which + // make recovery complicated. + if (result.allow_2pc) { + result.avoid_flush_during_recovery = false; + } + return result; } @@ -184,7 +223,47 @@ Status SanitizeOptionsByTable( return Status::OK(); } -CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { +static Status ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families) { + Status s; + + for (auto& cfd : column_families) { + s = CheckCompressionSupported(cfd.options); + if (s.ok() && db_options.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cfd.options); + } + if (!s.ok()) { + return s; + } + if (db_options.db_paths.size() > 1) { + if ((cfd.options.compaction_style != kCompactionStyleUniversal) && + (cfd.options.compaction_style != kCompactionStyleLevel)) { + return Status::NotSupported( + "More than one DB paths are only supported in " + "universal and level compaction styles. "); + } + } + } + + if (db_options.db_paths.size() > 4) { + return Status::NotSupported( + "More than four DB paths are not supported yet. "); + } + + if (db_options.allow_mmap_reads && !db_options.allow_os_buffer) { + // Protect against assert in PosixMMapReadableFile constructor + return Status::NotSupported( + "If memory mapped reads (allow_mmap_reads) are enabled " + "then os caching (allow_os_buffer) must also be enabled. 
"); + } + + return Status::OK(); +} + +CompressionType GetCompressionFlush( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) { // Compressing memtable flushes might not help unless the sequential load // optimization is used for leveled compaction. Otherwise the CPU and // latency overhead is not offset by saving much space. @@ -201,7 +280,7 @@ CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { } if (can_compress) { - return ioptions.compression; + return mutable_cf_options.compression; } else { return kNoCompression; } @@ -239,28 +318,34 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) total_log_size_(0), max_total_in_memory_state_(0), is_snapshot_supported_(true), - write_buffer_(options.db_write_buffer_size), + write_buffer_manager_(db_options_.write_buffer_manager.get()), + write_thread_(options.enable_write_thread_adaptive_yield + ? options.write_thread_max_yield_usec + : 0, + options.write_thread_slow_yield_usec), write_controller_(options.delayed_write_rate), last_batch_group_size_(0), unscheduled_flushes_(0), unscheduled_compactions_(0), bg_compaction_scheduled_(0), - bg_manual_only_(0), + num_running_compactions_(0), bg_flush_scheduled_(0), - manual_compaction_(nullptr), + num_running_flushes_(0), + bg_purge_scheduled_(0), disable_delete_obsolete_files_(0), delete_obsolete_files_next_run_( options.env->NowMicros() + db_options_.delete_obsolete_files_period_micros), last_stats_dump_time_microsec_(0), next_job_id_(1), - flush_on_destroy_(false), + has_unpersisted_data_(false), env_options_(db_options_), #ifndef ROCKSDB_LITE wal_manager_(db_options_, env_options_), #endif // ROCKSDB_LITE event_logger_(db_options_.info_log.get()), bg_work_paused_(0), + bg_compaction_paused_(0), refitting_level_(false), opened_successfully_(false) { env_->GetAbsolutePath(dbname, &db_absolute_path_); @@ -273,10 +358,10 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) 
NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits); versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_, + table_cache_.get(), write_buffer_manager_, &write_controller_)); - column_family_memtables_.reset(new ColumnFamilyMemTablesImpl( - versions_->GetColumnFamilySet(), &flush_scheduler_)); + column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); DumpRocksDBBuildVersion(db_options_.info_log.get()); DumpDBFileSummary(db_options_, dbname_); @@ -301,7 +386,8 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { DBImpl::~DBImpl() { mutex_.Lock(); - if (!shutting_down_.load(std::memory_order_acquire) && flush_on_destroy_) { + if (!shutting_down_.load(std::memory_order_acquire) && + has_unpersisted_data_) { for (auto cfd : *versions_->GetColumnFamilySet()) { if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) { cfd->Ref(); @@ -325,7 +411,9 @@ DBImpl::~DBImpl() { bg_flush_scheduled_ -= flushes_unscheduled; // Wait for background work to finish - while (bg_compaction_scheduled_ || bg_flush_scheduled_) { + while (bg_compaction_scheduled_ || bg_flush_scheduled_ || + bg_purge_scheduled_) { + TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob"); bg_cv_.Wait(); } EraseThreadStatusDbInfo(); @@ -382,6 +470,25 @@ DBImpl::~DBImpl() { } logs_.clear(); + // Table cache may have table handles holding blocks from the block cache. + // We need to release them before the block cache is destroyed. The block + // cache may be destroyed inside versions_.reset(), when column family data + // list is destroyed, so leaving handles in table cache after + // versions_.reset() may cause issues. + // Here we clean all unreferenced handles in table cache. + // Now we assume all user queries have finished, so only version set itself + // can possibly hold the blocks from block cache. 
After releasing unreferenced + // handles here, only handles held by version set left and inside + // versions_.reset(), we will release them. There, we need to make sure every + // time a handle is released, we erase it from the cache too. By doing that, + // we can guarantee that after versions_.reset(), table cache is empty + // so the cache can be safely destroyed. + table_cache_->EraseUnRefEntries(); + + for (auto& txn_entry : recovered_transactions_) { + delete txn_entry.second; + } + // versions need to be destroyed before table_cache since it can hold // references to table_cache. versions_.reset(); @@ -407,14 +514,14 @@ Status DBImpl::NewDB() { { unique_ptr file; EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); - s = env_->NewWritableFile(manifest, &file, env_options); + s = NewWritableFile(env_, manifest, &file, env_options); if (!s.ok()) { return s; } file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options)); - log::Writer log(std::move(file_writer)); + log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); @@ -458,6 +565,39 @@ void DBImpl::PrintStatistics() { } } +#ifndef ROCKSDB_LITE +#ifdef ROCKSDB_JEMALLOC +typedef struct { + char* cur; + char* end; +} MallocStatus; + +static void GetJemallocStatus(void* mstat_arg, const char* status) { + MallocStatus* mstat = reinterpret_cast(mstat_arg); + size_t status_len = status ? 
strlen(status) : 0; + size_t buf_size = (size_t)(mstat->end - mstat->cur); + if (!status_len || status_len > buf_size) { + return; + } + + snprintf(mstat->cur, buf_size, "%s", status); + mstat->cur += status_len; +} +#endif // ROCKSDB_JEMALLOC + +static void DumpMallocStats(std::string* stats) { +#ifdef ROCKSDB_JEMALLOC + MallocStatus mstat; + const uint kMallocStatusLen = 1000000; + std::unique_ptr buf{new char[kMallocStatusLen + 1]}; + mstat.cur = buf.get(); + mstat.end = buf.get() + kMallocStatusLen; + malloc_stats_print(GetJemallocStatus, &mstat, ""); + stats->append(buf.get()); +#endif // ROCKSDB_JEMALLOC +} +#endif // !ROCKSDB_LITE + void DBImpl::MaybeDumpStats() { if (db_options_.stats_dump_period_sec == 0) return; @@ -473,23 +613,25 @@ void DBImpl::MaybeDumpStats() { last_stats_dump_time_microsec_ = now_micros; #ifndef ROCKSDB_LITE - bool tmp1 = false; - bool tmp2 = false; - DBPropertyType cf_property_type = - GetPropertyType(DB::Properties::kCFStats, &tmp1, &tmp2); - DBPropertyType db_property_type = - GetPropertyType(DB::Properties::kDBStats, &tmp1, &tmp2); + const DBPropertyInfo* cf_property_info = + GetPropertyInfo(DB::Properties::kCFStats); + assert(cf_property_info != nullptr); + const DBPropertyInfo* db_property_info = + GetPropertyInfo(DB::Properties::kDBStats); + assert(db_property_info != nullptr); + std::string stats; { InstrumentedMutexLock l(&mutex_); for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->internal_stats()->GetStringProperty(cf_property_type, - DB::Properties::kCFStats, - &stats); + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFStats, &stats); } - default_cf_internal_stats_->GetStringProperty(db_property_type, - DB::Properties::kDBStats, - &stats); + default_cf_internal_stats_->GetStringProperty( + *db_property_info, DB::Properties::kDBStats, &stats); + } + if (db_options_.dump_malloc_stats) { + DumpMallocStats(&stats); } Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "------- DUMPING 
STATS -------"); @@ -501,6 +643,89 @@ void DBImpl::MaybeDumpStats() { } } +uint64_t DBImpl::FindMinPrepLogReferencedByMemTable() { + uint64_t min_log = 0; + + // we must look through the memtables for two phase transactions + // that have been committed but not yet flushed + for (auto loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd->IsDropped()) { + continue; + } + + auto log = loop_cfd->imm()->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + + log = loop_cfd->mem()->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +void DBImpl::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { + assert(log != 0); + std::lock_guard lock(prep_heap_mutex_); + auto it = prepared_section_completed_.find(log); + assert(it != prepared_section_completed_.end()); + it->second += 1; +} + +void DBImpl::MarkLogAsContainingPrepSection(uint64_t log) { + assert(log != 0); + std::lock_guard lock(prep_heap_mutex_); + min_log_with_prep_.push(log); + auto it = prepared_section_completed_.find(log); + if (it == prepared_section_completed_.end()) { + prepared_section_completed_[log] = 0; + } +} + +uint64_t DBImpl::FindMinLogContainingOutstandingPrep() { + std::lock_guard lock(prep_heap_mutex_); + uint64_t min_log = 0; + + // first we look in the prepared heap where we keep + // track of transactions that have been prepared (written to WAL) + // but not yet committed. + while (!min_log_with_prep_.empty()) { + min_log = min_log_with_prep_.top(); + + auto it = prepared_section_completed_.find(min_log); + + // value was marked as 'deleted' from heap + if (it != prepared_section_completed_.end() && it->second > 0) { + it->second -= 1; + min_log_with_prep_.pop(); + + // back to squere one... 
+ min_log = 0; + continue; + } else { + // found a valid value + break; + } + } + + return min_log; +} + +void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { + if (!job_context->logs_to_free.empty()) { + for (auto l : job_context->logs_to_free) { + AddToLogsToFreeQueue(l); + } + job_context->logs_to_free.clear(); + SchedulePurge(); + } +} + // * Returns the list of live files in 'sst_live' // If it's doing full scan: // * Returns the list of all files in the filesystem in @@ -550,6 +775,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // Get obsolete files. This function will also update the list of // pending files in VersionSet(). versions_->GetObsoleteFiles(&job_context->sst_delete_files, + &job_context->manifest_delete_files, job_context->min_pending_output); // store the current filenum, lognum, etc @@ -557,12 +783,37 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, job_context->pending_manifest_file_number = versions_->pending_manifest_file_number(); job_context->log_number = versions_->MinLogNumber(); + + if (allow_2pc()) { + // if are 2pc we must consider logs containing prepared + // sections of outstanding transactions. + // + // We must check min logs with outstanding prep before we check + // logs referneces by memtables because a log referenced by the + // first data structure could transition to the second under us. + // + // TODO(horuff): iterating over all column families under db mutex. 
+ should find more optimal solution + auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && + min_log_in_prep_heap < job_context->log_number) { + job_context->log_number = min_log_in_prep_heap; + } + + auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(); + + if (min_log_refed_by_mem != 0 && + min_log_refed_by_mem < job_context->log_number) { + job_context->log_number = min_log_refed_by_mem; + } + } + job_context->prev_log_number = versions_->prev_log_number(); versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { - for (uint32_t path_id = 0; path_id < db_options_.db_paths.size(); - path_id++) { + for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; @@ -570,8 +821,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, &files); // Ignore errors for (std::string file : files) { // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes - job_context->full_scan_candidate_files.emplace_back("/" + file, - path_id); + job_context->full_scan_candidate_files.emplace_back( + "/" + file, static_cast(path_id)); } } @@ -595,11 +846,23 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } if (!alive_log_files_.empty()) { - uint64_t min_log_number = versions_->MinLogNumber(); + uint64_t min_log_number = job_context->log_number; + size_t num_alive_log_files = alive_log_files_.size(); // find newly obsoleted log files while (alive_log_files_.begin()->number < min_log_number) { auto& earliest = *alive_log_files_.begin(); - job_context->log_delete_files.push_back(earliest.number); + if (db_options_.recycle_log_file_num > log_recycle_files.size()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "adding log %" PRIu64 " to recycle list\n", earliest.number); + 
log_recycle_files.push_back(earliest.number); + } else { + job_context->log_delete_files.push_back(earliest.number); + } + if (job_context->size_log_to_delete == 0) { + job_context->prev_total_log_size = total_log_size_; + job_context->num_alive_log_files = num_alive_log_files; + } + job_context->size_log_to_delete += earliest.size; total_log_size_ -= earliest.size; alive_log_files_.pop_front(); // Current log should always stay alive since it can't have @@ -639,11 +902,42 @@ bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, } }; // namespace +// Delete obsolete files and log status and information of file deletion +void DBImpl::DeleteObsoleteFileImpl(Status file_deletion_status, int job_id, + const std::string& fname, FileType type, + uint64_t number, uint32_t path_id) { + if (type == kTableFile) { + file_deletion_status = DeleteSSTFile(&db_options_, fname, path_id); + } else { + file_deletion_status = env_->DeleteFile(fname); + } + if (file_deletion_status.ok()) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id, + fname.c_str(), type, number, file_deletion_status.ToString().c_str()); + } else if (env_->FileExists(fname).IsNotFound()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64 + " -- %s\n", + job_id, fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } else { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n", job_id, + fname.c_str(), type, number, file_deletion_status.ToString().c_str()); + } + if (type == kTableFile) { + EventHelpers::LogAndNotifyTableFileDeletion( + &event_logger_, job_id, number, fname, file_deletion_status, GetName(), + db_options_.listeners); + } +} + // Diffs the files listed in filenames and those that do not // belong to live files are posibly removed. 
Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. -void DBImpl::PurgeObsoleteFiles(const JobContext& state) { +void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { // we'd better have sth to delete assert(state.HaveSomethingToDelete()); @@ -662,9 +956,9 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { } auto candidate_files = state.full_scan_candidate_files; - candidate_files.reserve(candidate_files.size() + - state.sst_delete_files.size() + - state.log_delete_files.size()); + candidate_files.reserve( + candidate_files.size() + state.sst_delete_files.size() + + state.log_delete_files.size() + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { @@ -680,12 +974,26 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { 0); } } + for (const auto& filename : state.manifest_delete_files) { + candidate_files.emplace_back(filename, 0); + } // dedup state.candidate_files so we don't try to delete the same // file twice - sort(candidate_files.begin(), candidate_files.end(), CompareCandidateFile); - candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()), - candidate_files.end()); + std::sort(candidate_files.begin(), candidate_files.end(), + CompareCandidateFile); + candidate_files.erase( + std::unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end()); + + if (state.prev_total_log_size > 0) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[JOB %d] Try to delete WAL files size %" PRIu64 + ", prev total WAL file size %" PRIu64 + ", number of live WAL files %" ROCKSDB_PRIszt ".\n", + state.job_id, state.size_log_to_delete, state.prev_total_log_size, + state.num_alive_log_files); + } std::vector old_info_log_files; InfoLogPrefix 
info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); @@ -722,8 +1030,12 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { // Also, SetCurrentFile creates a temp file when writing out new // manifest, which is equal to state.pending_manifest_file_number. We // should not delete that file + // + // TODO(yhchiang): carefully modify the third condition to safely + // remove the temp options files. keep = (sst_live_map.find(number) != sst_live_map.end()) || - (number == state.pending_manifest_file_number); + (number == state.pending_manifest_file_number) || + (to_delete.find(kOptionsFileNamePrefix) != std::string::npos); break; case kInfoLogFile: keep = true; @@ -735,6 +1047,7 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { case kDBLockFile: case kIdentityFile: case kMetaDatabase: + case kOptionsFile: keep = true; break; } @@ -760,40 +1073,21 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { continue; } #endif // !ROCKSDB_LITE + Status file_deletion_status; - if (type == kTableFile && path_id == 0) { - file_deletion_status = DeleteOrMoveToTrash(&db_options_, fname); - } else { - file_deletion_status = env_->DeleteFile(fname); - } - if (file_deletion_status.ok()) { - Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, - "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id, - fname.c_str(), type, number, - file_deletion_status.ToString().c_str()); - } else if (env_->FileExists(fname).IsNotFound()) { - Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, - "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64 - " -- %s\n", - state.job_id, fname.c_str(), type, number, - file_deletion_status.ToString().c_str()); + if (schedule_only) { + InstrumentedMutexLock guard_lock(&mutex_); + SchedulePendingPurge(fname, type, number, path_id, state.job_id); } else { - Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, - "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n", - state.job_id, fname.c_str(), type, 
number, - file_deletion_status.ToString().c_str()); - } - if (type == kTableFile) { - EventHelpers::LogAndNotifyTableFileDeletion( - &event_logger_, state.job_id, number, fname, - file_deletion_status, GetName(), - db_options_.listeners); + DeleteObsoleteFileImpl(file_deletion_status, state.job_id, fname, type, + number, path_id); } } // Delete old info log files. size_t old_info_log_file_count = old_info_log_files.size(); - if (old_info_log_file_count >= db_options_.keep_log_file_num) { + if (old_info_log_file_count != 0 && + old_info_log_file_count >= db_options_.keep_log_file_num) { std::sort(old_info_log_files.begin(), old_info_log_files.end()); size_t end = old_info_log_file_count - db_options_.keep_log_file_num; for (unsigned int i = 0; i <= end; i++) { @@ -898,7 +1192,7 @@ Directory* DBImpl::Directories::GetDataDir(size_t path_id) { Status DBImpl::Recover( const std::vector& column_families, bool read_only, - bool error_if_log_file_exist) { + bool error_if_log_file_exist, bool error_if_data_exists_in_logs) { mutex_.AssertHeld(); bool is_new_db = false; @@ -955,7 +1249,7 @@ Status DBImpl::Recover( s = CheckConsistency(); } if (s.ok()) { - SequenceNumber max_sequence(kMaxSequenceNumber); + SequenceNumber next_sequence(kMaxSequenceNumber); default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); @@ -969,8 +1263,6 @@ Status DBImpl::Recover( // Note that prev_log_number() is no longer used, but we pay // attention to it in case we are recovering a database // produced by an older version of rocksdb. 
- const uint64_t min_log = versions_->MinLogNumber(); - const uint64_t prev_log = versions_->prev_log_number(); std::vector filenames; s = env_->GetChildren(db_options_.wal_dir, &filenames); if (!s.ok()) { @@ -987,22 +1279,37 @@ Status DBImpl::Recover( "While creating a new Db, wal_dir contains " "existing log file: ", filenames[i]); - } else if ((number >= min_log) || (number == prev_log)) { + } else { logs.push_back(number); } } } - if (logs.size() > 0 && error_if_log_file_exist) { - return Status::Corruption("" - "The db was opened in readonly mode with error_if_log_file_exist" - "flag but a log file already exists"); + if (logs.size() > 0) { + if (error_if_log_file_exist) { + return Status::Corruption( + "The db was opened in readonly mode with error_if_log_file_exist" + "flag but a log file already exists"); + } else if (error_if_data_exists_in_logs) { + for (auto& log : logs) { + std::string fname = LogFileName(db_options_.wal_dir, log); + uint64_t bytes; + s = env_->GetFileSize(fname, &bytes); + if (s.ok()) { + if (bytes > 0) { + return Status::Corruption( + "error_if_data_exists_in_logs is set but there are data " + " in log files."); + } + } + } + } } if (!logs.empty()) { // Recover in the order in which the logs were generated std::sort(logs.begin(), logs.end()); - s = RecoverLogFiles(logs, &max_sequence, read_only); + s = RecoverLogFiles(logs, &next_sequence, read_only); if (!s.ok()) { // Clear memtables if recovery failed for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -1027,7 +1334,7 @@ Status DBImpl::Recover( // REQUIRES: log_numbers are sorted in ascending order Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, - SequenceNumber* max_sequence, bool read_only) { + SequenceNumber* next_sequence, bool read_only) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -1066,7 +1373,27 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, stream.EndArray(); } - bool continue_replay_log = true; 
+#ifndef ROCKSDB_LITE + if (db_options_.wal_filter != nullptr) { + std::map cf_name_id_map; + std::map cf_lognumber_map; + for (auto cfd : *versions_->GetColumnFamilySet()) { + cf_name_id_map.insert( + std::make_pair(cfd->GetName(), cfd->GetID())); + cf_lognumber_map.insert( + std::make_pair(cfd->GetID(), cfd->GetLogNumber())); + } + + db_options_.wal_filter->ColumnFamilyLogNumberMap( + cf_lognumber_map, + cf_name_id_map); + } +#endif + + bool stop_replay_by_wal_filter = false; + bool stop_replay_for_corruption = false; + bool flushed = false; + SequenceNumber recovered_sequence = 0; for (auto log_number : log_numbers) { // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually @@ -1074,6 +1401,23 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, versions_->MarkFileNumberUsedDuringRecovery(log_number); // Open the log file std::string fname = LogFileName(db_options_.wal_dir, log_number); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Recovering log #%" PRIu64 " mode %d", log_number, + db_options_.wal_recovery_mode); + auto logFileDropped = [this, &fname]() { + uint64_t bytes; + if (env_->GetFileSize(fname, &bytes).ok()) { + auto info_log = db_options_.info_log.get(); + Log(InfoLogLevel::WARN_LEVEL, info_log, "%s: dropping %d bytes", + fname.c_str(), static_cast(bytes)); + } + }; + if (stop_replay_by_wal_filter) { + logFileDropped(); + continue; + } + unique_ptr file_reader; { unique_ptr file; @@ -1107,60 +1451,148 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). 
- log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); - Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, - "Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number, - db_options_.wal_recovery_mode, !continue_replay_log); + log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/, log_number); // Determine if we should tolerate incomplete records at the tail end of the - // log - bool report_eof_inconsistency; - if (db_options_.wal_recovery_mode == - WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files - report_eof_inconsistency = true; - } else { - // for other modes ignore only incomplete records in the last log file - // which is presumably due to write in progress during restart - report_eof_inconsistency = false; - - // TODO krad: Evaluate if we need to move to a more strict mode where we - // restrict the inconsistency to only the last log - } - // Read all the records and add to a memtable std::string scratch; Slice record; WriteBatch batch; - if (!continue_replay_log) { - uint64_t bytes; - if (env_->GetFileSize(fname, &bytes).ok()) { - auto info_log = db_options_.info_log.get(); - Log(InfoLogLevel::WARN_LEVEL, info_log, "%s: dropping %d bytes", - fname.c_str(), static_cast(bytes)); - } - } - - while (continue_replay_log && - reader.ReadRecord(&record, &scratch, report_eof_inconsistency) && - status.ok()) { - if (record.size() < 12) { + while ( + !stop_replay_by_wal_filter && + reader.ReadRecord(&record, &scratch, db_options_.wal_recovery_mode) && + status.ok()) { + if (record.size() < WriteBatchInternal::kHeader) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); continue; } WriteBatchInternal::SetContents(&batch, record); + SequenceNumber sequence = WriteBatchInternal::Sequence(&batch); + + // In point-in-time recovery mode, if sequence id of log files are 
+ consecutive, we continue recovery despite corruption. This could happen + when we open and write to a corrupted DB, where sequence id will start + from the last sequence id we recovered. + if (db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery) { + if (sequence == recovered_sequence + 1) { + stop_replay_for_corruption = false; + } + if (stop_replay_for_corruption) { + logFileDropped(); + break; + } + } + + recovered_sequence = sequence; + bool no_prev_seq = true; + if (*next_sequence == kMaxSequenceNumber) { + *next_sequence = sequence; + } else { + no_prev_seq = false; + WriteBatchInternal::SetSequence(&batch, *next_sequence); + } + +#ifndef ROCKSDB_LITE + if (db_options_.wal_filter != nullptr) { + WriteBatch new_batch; + bool batch_changed = false; + + WalFilter::WalProcessingOption wal_processing_option = + db_options_.wal_filter->LogRecordFound(log_number, fname, batch, + &new_batch, &batch_changed); + + switch (wal_processing_option) { + case WalFilter::WalProcessingOption::kContinueProcessing: + // do nothing, proceed normally + break; + case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: + // skip current record + continue; + case WalFilter::WalProcessingOption::kStopReplay: + // skip current record and stop replay + stop_replay_by_wal_filter = true; + continue; + case WalFilter::WalProcessingOption::kCorruptedRecord: { + status = Status::Corruption("Corruption reported by Wal Filter ", + db_options_.wal_filter->Name()); + MaybeIgnoreError(&status); + if (!status.ok()) { + reporter.Corruption(record.size(), status); + continue; + } + break; + } + default: { + assert(false); // unhandled case + status = Status::NotSupported( + "Unknown WalProcessingOption returned" + " by Wal Filter ", + db_options_.wal_filter->Name()); + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Ignore the error with current record processing. 
+ continue; + } + } + + if (batch_changed) { + // Make sure that the count in the new batch is + // within the original count. + int new_count = WriteBatchInternal::Count(&new_batch); + int original_count = WriteBatchInternal::Count(&batch); + if (new_count > original_count) { + Log(InfoLogLevel::FATAL_LEVEL, db_options_.info_log, + "Recovering log #%" PRIu64 + " mode %d log filter %s returned " + "more records (%d) than original (%d) which is not allowed. " + "Aborting recovery.", + log_number, db_options_.wal_recovery_mode, + db_options_.wal_filter->Name(), new_count, original_count); + status = Status::NotSupported( + "More than original # of records " + "returned by Wal Filter ", + db_options_.wal_filter->Name()); + return status; + } + // Set the same sequence number in the new_batch + // as the original batch. + WriteBatchInternal::SetSequence(&new_batch, + WriteBatchInternal::Sequence(&batch)); + batch = new_batch; + } + } +#endif // ROCKSDB_LITE // If column family was not found, it might mean that the WAL write // batch references to the column family that was dropped after the // insert. We don't want to fail the whole write batch in that case -- // we just ignore the update. // That's why we set ignore missing column families to true + bool has_valid_writes = false; + // If we pass DB through and options.max_successive_merges is hit + // during recovery, Get() will be issued which will try to acquire + // DB mutex and cause deadlock, as DB mutex is already held. + // The DB pointer is not needed unless 2PC is used. + // TODO(sdong) fix the allow_2pc case too. status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), true, log_number); - + &batch, column_family_memtables_.get(), &flush_scheduler_, true, + log_number, db_options_.allow_2pc ? 
this : nullptr, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes); + // If it is the first log file and there is no column family updated + // after replaying the file, this file may be a stale file. We ignore + // sequence IDs from the file. Otherwise, if a newer stale log file that + // has been deleted, the sequenceID may be wrong. + if (no_prev_seq && !has_valid_writes) { + *next_sequence = kMaxSequenceNumber; + } MaybeIgnoreError(&status); if (!status.ok()) { // We are treating this as a failure while reading since we read valid @@ -1169,18 +1601,12 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, continue; } - const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if ((*max_sequence == kMaxSequenceNumber) || (last_seq > *max_sequence)) { - *max_sequence = last_seq; - } - - if (!read_only) { + if (has_valid_writes && !read_only) { // we can do this because this is called before client has access to the // DB and there is only a single thread operating on DB ColumnFamilyData* cfd; - while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { cfd->Unref(); // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families @@ -1194,9 +1620,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // file-systems cause the DB::Open() to fail. 
return status; } + flushed = true; cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), - *max_sequence); + *next_sequence); } } } @@ -1210,11 +1637,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, WALRecoveryMode::kPointInTimeRecovery) { // We should ignore the error but not continue replaying status = Status::OK(); - continue_replay_log = false; - + stop_replay_for_corruption = true; Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, - log_number, *max_sequence); + log_number, *next_sequence); } else { assert(db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords @@ -1225,9 +1651,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } flush_scheduler_.Clear(); - if ((*max_sequence != kMaxSequenceNumber) && - (versions_->LastSequence() < *max_sequence)) { - versions_->SetLastSequence(*max_sequence); + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastSequence(last_sequence); } } @@ -1252,14 +1679,20 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // flush the final memtable (if non-empty) if (cfd->mem()->GetFirstSequenceNumber() != 0) { - status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); - if (!status.ok()) { - // Recovery failed - break; - } + // If flush happened in the middle of recovery (e.g. due to memtable + // being full), we flush at the end. Otherwise we'll need to record + // where we were on last flush, which make the logic complicated. 
+ if (flushed || !db_options_.avoid_flush_during_recovery) { + status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); + if (!status.ok()) { + // Recovery failed + break; + } + flushed = true; - cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), - *max_sequence); + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + *next_sequence); + } } // write MANIFEST with update @@ -1268,7 +1701,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // recovered and should be ignored on next reincarnation. // Since we already recovered max_log_number, we want all logs // with numbers `<= max_log_number` (includes this one) to be ignored - edit->SetLogNumber(max_log_number + 1); + if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { + edit->SetLogNumber(max_log_number + 1); + } // we must mark the next log number as used, even though it's // not actually used. that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any @@ -1294,9 +1729,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; - meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); + meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -1309,37 +1744,33 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, " Level-0 table #%" PRIu64 ": started", cfd->GetName().c_str(), meta.fd.GetNumber()); + // Get the latest mutable cf options while the mutex is still locked + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); bool paranoid_file_checks = cfd->GetLatestMutableCFOptions()->paranoid_file_checks; { mutex_.Unlock(); - TableFileCreationInfo info; + + SequenceNumber earliest_write_conflict_snapshot; + 
std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + s = BuildTable( - dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), - iter.get(), &meta, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), snapshots_.GetAll(), - GetCompressionFlush(*cfd->ioptions()), + dbname_, env_, *cfd->ioptions(), mutable_cf_options, env_options_, + cfd->table_cache(), iter.get(), &meta, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), + snapshot_seqs, earliest_write_conflict_snapshot, + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), cfd->ioptions()->compression_opts, paranoid_file_checks, - cfd->internal_stats(), Env::IO_HIGH, &info.table_properties); + cfd->internal_stats(), TableFileCreationReason::kRecovery, + &event_logger_, job_id); LogFlush(db_options_.info_log); Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); - - // output to event logger - if (s.ok()) { - info.db_name = dbname_; - info.cf_name = cfd->GetName(); - info.file_path = TableFileName(db_options_.db_paths, - meta.fd.GetNumber(), - meta.fd.GetPathId()); - info.file_size = meta.fd.GetFileSize(); - info.job_id = job_id; - EventHelpers::LogAndNotifyTableFileCreation( - &event_logger_, db_options_.listeners, meta.fd, info); - } mutex_.Lock(); } } @@ -1366,6 +1797,49 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, return s; } +Status DBImpl::SyncClosedLogs(JobContext* job_context) { + mutex_.AssertHeld(); + autovector logs_to_sync; + uint64_t current_log_number = logfile_number_; + while (logs_.front().number < current_log_number && + logs_.front().getting_synced) { + log_sync_cv_.Wait(); + } + for (auto it = logs_.begin(); + it != logs_.end() && it->number < current_log_number; 
++it) { + auto& log = *it; + assert(!log.getting_synced); + log.getting_synced = true; + logs_to_sync.push_back(log.writer); + } + + Status s; + if (!logs_to_sync.empty()) { + mutex_.Unlock(); + + for (log::Writer* log : logs_to_sync) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, + log->get_log_number()); + s = log->file()->Sync(db_options_.use_fsync); + } + if (s.ok()) { + s = directories_.GetWalDir()->Fsync(); + } + + mutex_.Lock(); + + // "number <= current_log_number - 1" is equivalent to + // "number < current_log_number". + MarkLogsSynced(current_log_number - 1, true, s); + if (!s.ok()) { + bg_error_ = s; + return s; + } + } + return s; +} + Status DBImpl::FlushMemTableToOutputFile( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) { @@ -1373,22 +1847,44 @@ Status DBImpl::FlushMemTableToOutputFile( assert(cfd->imm()->NumNotFlushed() != 0); assert(cfd->imm()->IsFlushPending()); - FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options, - env_options_, versions_.get(), &mutex_, &shutting_down_, - snapshots_.GetAll(), job_context, log_buffer, - directories_.GetDbDir(), directories_.GetDataDir(0U), - GetCompressionFlush(*cfd->ioptions()), stats_, - &event_logger_); + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + + FlushJob flush_job( + dbname_, cfd, db_options_, mutable_cf_options, env_options_, + versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, + earliest_write_conflict_snapshot, job_context, log_buffer, + directories_.GetDbDir(), directories_.GetDataDir(0U), + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, + &event_logger_, mutable_cf_options.report_bg_io_stats); FileMetaData file_meta; + flush_job.PickMemTable(); + + Status s; + if (logfile_number_ > 0 && + 
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 0 && + !db_options_.disableDataSync) { + // If there are more than one column families, we need to make sure that + // all the log files except the most recent one are synced. Otherwise if + // the host crashes after flushing and before WAL is persistent, the + // flushed SST may contain data from write batches whose updates to + // other column families are missing. + // SyncClosedLogs() may unlock and re-lock the db_mutex. + s = SyncClosedLogs(job_context); + } + // Within flush_job.Run, rocksdb may call event listener to notify // file creation and deletion. // // Note that flush_job.Run will unlock and lock the db_mutex, // and EventListener callback will be called when the db_mutex // is unlocked by the current thread. - Status s = flush_job.Run(&file_meta); + if (s.ok()) { + s = flush_job.Run(&file_meta); + } if (s.ok()) { InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context, @@ -1407,20 +1903,33 @@ Status DBImpl::FlushMemTableToOutputFile( // true, mark DB read-only bg_error_ = s; } - RecordFlushIOStats(); -#ifndef ROCKSDB_LITE if (s.ok()) { +#ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. 
NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, - job_context->job_id); - } + job_context->job_id, flush_job.GetTableProperties()); #endif // ROCKSDB_LITE + auto sfm = + static_cast(db_options_.sst_file_manager.get()); + if (sfm) { + // Notify sst_file_manager that a new file was added + std::string file_path = MakeTableFileName(db_options_.db_paths[0].path, + file_meta.fd.GetNumber()); + sfm->OnAddFile(file_path); + if (sfm->IsMaxAllowedSpaceReached() && bg_error_.ok()) { + bg_error_ = Status::IOError("Max allowed space was reached"); + TEST_SYNC_POINT( + "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached"); + } + } + } return s; } -void DBImpl::NotifyOnFlushCompleted( - ColumnFamilyData* cfd, FileMetaData* file_meta, - const MutableCFOptions& mutable_cf_options, int job_id) { +void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, + FileMetaData* file_meta, + const MutableCFOptions& mutable_cf_options, + int job_id, TableProperties prop) { #ifndef ROCKSDB_LITE if (db_options_.listeners.size() == 0U) { return; @@ -1450,6 +1959,7 @@ void DBImpl::NotifyOnFlushCompleted( info.triggered_writes_stop = triggered_writes_stop; info.smallest_seqno = file_meta->smallest_seqno; info.largest_seqno = file_meta->largest_seqno; + info.table_properties = prop; for (auto listener : db_options_.listeners) { listener->OnFlushCompleted(this, info); } @@ -1469,6 +1979,7 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); + bool exclusive = options.exclusive_manual_compaction; Status s = FlushMemTable(cfd, FlushOptions()); if (!s.ok()) { @@ -1494,7 +2005,7 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, // Always compact all files together. 
s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, cfd->NumberLevels() - 1, options.target_path_id, - begin, end); + begin, end, exclusive); final_output_level = cfd->NumberLevels() - 1; } else { for (int level = 0; level <= max_level_with_files; level++) { @@ -1529,7 +2040,7 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, } } s = RunManualCompaction(cfd, level, output_level, options.target_path_id, - begin, end); + begin, end, exclusive); if (!s.ok()) { break; } @@ -1690,19 +2201,29 @@ Status DBImpl::CompactFilesImpl( c.reset(cfd->compaction_picker()->FormCompaction( compact_options, input_files, output_level, version->storage_info(), *cfd->GetLatestMutableCFOptions(), output_path_id)); - assert(c); + if (!c) { + return Status::Aborted("Another Level 0 compaction is running"); + } c->SetInputVersion(version); // deletion compaction currently not allowed in CompactFiles. assert(!c->deletion_compaction()); + running_compactions_.insert(c.get()); + + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + + auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, snapshots_.GetAll(), - table_cache_, &event_logger_, - c->mutable_cf_options()->paranoid_file_checks, - c->mutable_cf_options()->compaction_measure_io_stats, dbname_, + directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, &bg_error_, + snapshot_seqs, earliest_write_conflict_snapshot, table_cache_, + &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + c->mutable_cf_options()->report_bg_io_stats, dbname_, nullptr); // Here we pass a nullptr for CompactionJobStats because // CompactFiles 
does not trigger OnCompactionCompleted(), // which is the only place where CompactionJobStats is @@ -1716,19 +2237,33 @@ Status DBImpl::CompactFilesImpl( // support for CompactFiles, we should have CompactFiles API // pass a pointer of CompactionJobStats as the out-value // instead of using EventListener. + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here. + version->storage_info()->ComputeCompactionScore(*c->mutable_cf_options()); + compaction_job.Prepare(); mutex_.Unlock(); + TEST_SYNC_POINT("CompactFilesImpl:0"); + TEST_SYNC_POINT("CompactFilesImpl:1"); compaction_job.Run(); + TEST_SYNC_POINT("CompactFilesImpl:2"); + TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); - Status status = compaction_job.Install(*c->mutable_cf_options(), &mutex_); + Status status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWorkWrapper( c->column_family_data(), job_context, *c->mutable_cf_options()); } c->ReleaseCompactionFiles(s); - c.reset(); + + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + running_compactions_.erase(c.get()); if (status.ok()) { // Done @@ -1744,6 +2279,8 @@ Status DBImpl::CompactFilesImpl( } } + c.reset(); + bg_compaction_scheduled_--; if (bg_compaction_scheduled_ == 0) { bg_cv_.SignalAll(); @@ -1755,17 +2292,25 @@ Status DBImpl::CompactFilesImpl( Status DBImpl::PauseBackgroundWork() { InstrumentedMutexLock guard_lock(&mutex_); - bg_work_paused_++; + bg_compaction_paused_++; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) { bg_cv_.Wait(); } + bg_work_paused_++; return Status::OK(); } Status DBImpl::ContinueBackgroundWork() { InstrumentedMutexLock guard_lock(&mutex_); + if (bg_work_paused_ == 0) { + return Status::InvalidArgument(); + } assert(bg_work_paused_ > 0); + 
assert(bg_compaction_paused_ > 0); + bg_compaction_paused_--; bg_work_paused_--; + // It's sufficient to check just bg_work_paused_ here since + // bg_work_paused_ is always no greater than bg_compaction_paused_ if (bg_work_paused_ == 0) { MaybeScheduleFlushOrCompaction(); } @@ -1786,6 +2331,7 @@ void DBImpl::NotifyOnCompactionCompleted( } // release lock while notifying events mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); { CompactionJobInfo info; info.cf_name = cfd->GetName(); @@ -1795,12 +2341,21 @@ void DBImpl::NotifyOnCompactionCompleted( info.base_input_level = c->start_level(); info.output_level = c->output_level(); info.stats = compaction_job_stats; + info.table_properties = c->GetOutputTableProperties(); + info.compaction_reason = c->compaction_reason(); + info.compression = c->output_compression(); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { - info.input_files.push_back( - TableFileName(db_options_.db_paths, - fmd->fd.GetNumber(), - fmd->fd.GetPathId())); + auto fn = TableFileName(db_options_.db_paths, fmd->fd.GetNumber(), + fmd->fd.GetPathId()); + info.input_files.push_back(fn); + if (info.table_properties.count(fn) == 0) { + std::shared_ptr tp; + auto s = cfd->current()->GetTableProperties(&tp, fmd, &fn); + if (s.ok()) { + info.table_properties[fn] = tp; + } + } } } for (const auto newf : c->edit()->GetNewFiles()) { @@ -1834,11 +2389,26 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, MutableCFOptions new_options; Status s; + Status persist_options_status; { InstrumentedMutexLock l(&mutex_); s = cfd->SetOptions(options_map); if (s.ok()) { new_options = *cfd->GetLatestMutableCFOptions(); + // Trigger possible flush/compactions. This has to be before we persist + // options to file, otherwise there will be a deadlock with writer + // thread. 
+ auto* old_sv = + InstallSuperVersionAndScheduleWork(cfd, nullptr, new_options); + delete old_sv; + + // Persist RocksDB options under the single write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + + persist_options_status = WriteOptionsFile(); + + write_thread_.ExitUnbatched(&w); } } @@ -1854,6 +2424,16 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, db_options_.info_log, "[%s] SetOptions succeeded", cfd->GetName().c_str()); new_options.Dump(db_options_.info_log.get()); + if (!persist_options_status.ok()) { + if (db_options_.fail_if_options_file_error) { + s = Status::IOError( + "SetOptions succeeded, but unable to persist options", + persist_options_status.ToString()); + } + Warn(db_options_.info_log, + "Unable to persist options in SetOptions() -- %s", + persist_options_status.ToString().c_str()); + } } else { Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "[%s] SetOptions failed", cfd->GetName().c_str()); @@ -2037,10 +2617,12 @@ Status DBImpl::SyncWAL() { status = directories_.GetWalDir()->Fsync(); } + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); { InstrumentedMutexLock l(&mutex_); MarkLogsSynced(current_log_number, need_log_dir_sync, status); } + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); return status; } @@ -2064,7 +2646,8 @@ void DBImpl::MarkLogsSynced( ++it; } } - assert(logs_.empty() || (logs_.size() == 1 && !logs_[0].getting_synced)); + assert(logs_.empty() || logs_[0].number > up_to || + (logs_.size() == 1 && !logs_[0].getting_synced)); log_sync_cv_.SignalAll(); } @@ -2075,12 +2658,15 @@ SequenceNumber DBImpl::GetLatestSequenceNumber() const { Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, int output_level, uint32_t output_path_id, const Slice* begin, const Slice* end, - bool disallow_trivial_move) { + bool exclusive, bool disallow_trivial_move) { assert(input_level == ColumnFamilyData::kCompactAllLevels || input_level >= 0); InternalKey 
begin_storage, end_storage; + CompactionArg* ca; + bool scheduled = false; + bool manual_conflict = false; ManualCompaction manual; manual.cfd = cfd; manual.input_level = input_level; @@ -2088,6 +2674,8 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, manual.output_path_id = output_path_id; manual.done = false; manual.in_progress = false; + manual.incomplete = false; + manual.exclusive = exclusive; manual.disallow_trivial_move = disallow_trivial_move; // For universal compaction, we enforce every manual compaction to compact // all files. @@ -2115,7 +2703,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, // jobs drops to zero. This is needed to ensure that this manual compaction // can compact any range of keys/files. // - // bg_manual_only_ is non-zero when at least one thread is inside + // HasPendingManualCompaction() is true when at least one thread is inside // RunManualCompaction(), i.e. during that time no other compaction will // get scheduled (see MaybeScheduleFlushOrCompaction). // @@ -2124,13 +2712,16 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. 
- ++bg_manual_only_; - while (bg_compaction_scheduled_ > 0) { - Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, - "[%s] Manual compaction waiting for all other scheduled background " - "compactions to finish", - cfd->GetName().c_str()); - bg_cv_.Wait(); + AddManualCompaction(&manual); + TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); + if (exclusive) { + while (bg_compaction_scheduled_ > 0) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Manual compaction waiting for all other scheduled background " + "compactions to finish", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } } Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, @@ -2141,23 +2732,68 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, // the compaction will set manual.status to bg_error_ and set manual.done to // true. while (!manual.done) { - assert(bg_manual_only_ > 0); - if (manual_compaction_ != nullptr) { + assert(HasPendingManualCompaction()); + manual_conflict = false; + if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || + scheduled || + ((manual.manual_end = &manual.tmp_storage1)&&( + (manual.compaction = manual.cfd->CompactRange( + *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, + manual.output_level, manual.output_path_id, manual.begin, + manual.end, &manual.manual_end, &manual_conflict)) == + nullptr) && + manual_conflict)) { + // exclusive manual compactions should not see a conflict during + // CompactRange + assert(!exclusive || !manual_conflict); // Running either this or some other manual compaction bg_cv_.Wait(); - } else { - manual_compaction_ = &manual; + if (scheduled && manual.incomplete == true) { + assert(!manual.in_progress); + scheduled = false; + manual.incomplete = false; + } + } else if (!scheduled) { + if (manual.compaction == nullptr) { + manual.done = true; + bg_cv_.SignalAll(); + continue; + } + ca = new CompactionArg; + ca->db = this; + ca->m = &manual; + 
manual.incomplete = false; bg_compaction_scheduled_++; - env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW, this); + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, + &DBImpl::UnscheduleCallback); + scheduled = true; } } assert(!manual.in_progress); - assert(bg_manual_only_ > 0); - --bg_manual_only_; + assert(HasPendingManualCompaction()); + RemoveManualCompaction(&manual); + bg_cv_.SignalAll(); return manual.status; } +InternalIterator* DBImpl::NewInternalIterator( + Arena* arena, ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + + mutex_.Lock(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + mutex_.Unlock(); + ReadOptions roptions; + return NewInternalIterator(roptions, cfd, super_version, arena); +} + Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& flush_options) { Status s; @@ -2208,6 +2844,20 @@ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { return s; } +Status DBImpl::EnableAutoCompaction( + const std::vector& column_family_handles) { + Status s; + for (auto cf_ptr : column_family_handles) { + Status status = + this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}); + if (!status.ok()) { + s = status; + } + } + + return s; +} + void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); if (!opened_successfully_) { @@ -2229,29 +2879,57 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); } + auto bg_compactions_allowed = BGCompactionsAllowed(); + // special case -- if max_background_flushes == 0, then schedule flush on a // compaction thread if (db_options_.max_background_flushes == 0) { while (unscheduled_flushes_ > 0 && bg_flush_scheduled_ + bg_compaction_scheduled_ < - db_options_.max_background_compactions) { + 
bg_compactions_allowed) { unscheduled_flushes_--; bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); } } - if (bg_manual_only_) { + if (bg_compaction_paused_ > 0) { + // we paused the background compaction + return; + } + + if (HasExclusiveManualCompaction()) { // only manual compactions are allowed to run. don't schedule automatic // compactions return; } - while (bg_compaction_scheduled_ < db_options_.max_background_compactions && + while (bg_compaction_scheduled_ < bg_compactions_allowed && unscheduled_compactions_ > 0) { + CompactionArg* ca = new CompactionArg; + ca->db = this; + ca->m = nullptr; bg_compaction_scheduled_++; unscheduled_compactions_--; - env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW, this); + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, + &DBImpl::UnscheduleCallback); + } +} + +void DBImpl::SchedulePurge() { + mutex_.AssertHeld(); + assert(opened_successfully_); + + // Purge operations are put into High priority queue + bg_purge_scheduled_++; + env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr); +} + +int DBImpl::BGCompactionsAllowed() const { + if (write_controller_.NeedSpeedupCompaction()) { + return db_options_.max_background_compactions; + } else { + return db_options_.base_background_compactions; } } @@ -2301,9 +2979,12 @@ void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { } } -void DBImpl::RecordFlushIOStats() { - RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); - IOSTATS_RESET(bytes_written); +void DBImpl::SchedulePendingPurge(std::string fname, FileType type, + uint64_t number, uint32_t path_id, + int job_id) { + mutex_.AssertHeld(); + PurgeFileInfo file_info(fname, type, number, path_id, job_id); + purge_queue_.push_back(std::move(file_info)); } void DBImpl::BGWorkFlush(void* db) { @@ -2313,10 +2994,68 @@ void DBImpl::BGWorkFlush(void* db) { TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); } -void 
DBImpl::BGWorkCompaction(void* db) { +void DBImpl::BGWorkCompaction(void* arg) { + CompactionArg ca = *(reinterpret_cast(arg)); + delete reinterpret_cast(arg); IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); - reinterpret_cast(db)->BackgroundCallCompaction(); + reinterpret_cast(ca.db)->BackgroundCallCompaction(ca.m); +} + +void DBImpl::BGWorkPurge(void* db) { + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); + TEST_SYNC_POINT("DBImpl::BGWorkPurge:start"); + reinterpret_cast(db)->BackgroundCallPurge(); + TEST_SYNC_POINT("DBImpl::BGWorkPurge:end"); +} + +void DBImpl::UnscheduleCallback(void* arg) { + CompactionArg ca = *(reinterpret_cast(arg)); + delete reinterpret_cast(arg); + if ((ca.m != nullptr) && (ca.m->compaction != nullptr)) { + delete ca.m->compaction; + } + TEST_SYNC_POINT("DBImpl::UnscheduleCallback"); +} + +void DBImpl::BackgroundCallPurge() { + mutex_.Lock(); + + // We use one single loop to clear both queues so that after existing the loop + // both queues are empty. This is stricter than what is needed, but can make + // it easier for us to reason the correctness. + while (!purge_queue_.empty() || !logs_to_free_queue_.empty()) { + if (!purge_queue_.empty()) { + auto purge_file = purge_queue_.begin(); + auto fname = purge_file->fname; + auto type = purge_file->type; + auto number = purge_file->number; + auto path_id = purge_file->path_id; + auto job_id = purge_file->job_id; + purge_queue_.pop_front(); + + mutex_.Unlock(); + Status file_deletion_status; + DeleteObsoleteFileImpl(file_deletion_status, job_id, fname, type, number, + path_id); + mutex_.Lock(); + } else { + assert(!logs_to_free_queue_.empty()); + log::Writer* log_writer = *(logs_to_free_queue_.begin()); + logs_to_free_queue_.pop_front(); + mutex_.Unlock(); + delete log_writer; + mutex_.Lock(); + } + } + bg_purge_scheduled_--; + + bg_cv_.SignalAll(); + // IMPORTANT:there should be no code after calling SignalAll. 
This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. + mutex_.Unlock(); } Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, @@ -2356,10 +3095,10 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, LogToBuffer( log_buffer, "Calling FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d, compaction slots available %d", - cfd->GetName().c_str(), - db_options_.max_background_flushes - bg_flush_scheduled_, - db_options_.max_background_compactions - bg_compaction_scheduled_); + "family [%s], flush slots available %d, compaction slots allowed %d, " + "compaction slots scheduled %d", + cfd->GetName().c_str(), db_options_.max_background_flushes, + bg_flush_scheduled_, BGCompactionsAllowed() - bg_compaction_scheduled_); status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, job_context, log_buffer); if (cfd->Unref()) { @@ -2374,9 +3113,12 @@ void DBImpl::BackgroundCallFlush() { JobContext job_context(next_job_id_.fetch_add(1), true); assert(bg_flush_scheduled_); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start"); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { InstrumentedMutexLock l(&mutex_); + num_running_flushes_++; auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); @@ -2422,10 +3164,11 @@ void DBImpl::BackgroundCallFlush() { mutex_.Lock(); } + assert(num_running_flushes_ > 0); + num_running_flushes_--; bg_flush_scheduled_--; // See if there's more work to be done MaybeScheduleFlushOrCompaction(); - RecordFlushIOStats(); bg_cv_.SignalAll(); // IMPORTANT: there should be no code after calling SignalAll. This call may // signal the DB destructor that it's OK to proceed with destruction. 
In @@ -2434,20 +3177,24 @@ void DBImpl::BackgroundCallFlush() { } } -void DBImpl::BackgroundCallCompaction() { +void DBImpl::BackgroundCallCompaction(void* arg) { bool made_progress = false; + ManualCompaction* m = reinterpret_cast(arg); JobContext job_context(next_job_id_.fetch_add(1), true); - + TEST_SYNC_POINT("BackgroundCallCompaction:0"); MaybeDumpStats(); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { InstrumentedMutexLock l(&mutex_); + num_running_compactions_++; auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); assert(bg_compaction_scheduled_); - Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer); + Status s = + BackgroundCompaction(&made_progress, &job_context, &log_buffer, m); + TEST_SYNC_POINT("BackgroundCallCompaction:1"); if (!s.ok() && !s.IsShutdownInProgress()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to @@ -2490,17 +3237,20 @@ void DBImpl::BackgroundCallCompaction() { mutex_.Lock(); } + assert(num_running_compactions_ > 0); + num_running_compactions_--; bg_compaction_scheduled_--; versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); // See if there's more work to be done MaybeScheduleFlushOrCompaction(); - if (made_progress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) { + if (made_progress || bg_compaction_scheduled_ == 0 || + HasPendingManualCompaction()) { // signal if // * made_progress -- need to wakeup DelayWrite // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl - // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction + // * HasPendingManualCompaction -- need to wakeup RunManualCompaction // If none of this is true, there is no need to signal since nobody is // waiting for it bg_cv_.SignalAll(); @@ -2514,14 +3264,18 @@ void DBImpl::BackgroundCallCompaction() { Status DBImpl::BackgroundCompaction(bool* made_progress, JobContext* job_context, - 
LogBuffer* log_buffer) { + LogBuffer* log_buffer, void* arg) { + ManualCompaction* manual_compaction = + reinterpret_cast(arg); *made_progress = false; mutex_.AssertHeld(); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); - bool is_manual = (manual_compaction_ != nullptr) && - (manual_compaction_->in_progress == false); - bool trivial_move_disallowed = is_manual && - manual_compaction_->disallow_trivial_move; + bool is_manual = (manual_compaction != nullptr); + + // (manual_compaction->in_progress == false); + bool trivial_move_disallowed = + is_manual && manual_compaction->disallow_trivial_move; CompactionJobStats compaction_job_stats; Status status = bg_error_; @@ -2531,34 +3285,30 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (!status.ok()) { if (is_manual) { - manual_compaction_->status = status; - manual_compaction_->done = true; - manual_compaction_->in_progress = false; - manual_compaction_ = nullptr; + manual_compaction->status = status; + manual_compaction->done = true; + manual_compaction->in_progress = false; + delete manual_compaction->compaction; + manual_compaction = nullptr; } return status; } if (is_manual) { // another thread cannot pick up the same work - manual_compaction_->in_progress = true; - } else if (manual_compaction_ != nullptr) { - // there should be no automatic compactions running when manual compaction - // is running - return Status::OK(); + manual_compaction->in_progress = true; } unique_ptr c; - InternalKey manual_end_storage; - InternalKey* manual_end = &manual_end_storage; + // InternalKey manual_end_storage; + // InternalKey* manual_end = &manual_end_storage; if (is_manual) { - ManualCompaction* m = manual_compaction_; + ManualCompaction* m = manual_compaction; assert(m->in_progress); - c.reset(m->cfd->CompactRange( - *m->cfd->GetLatestMutableCFOptions(), m->input_level, m->output_level, - m->output_path_id, m->begin, m->end, &manual_end)); + c.reset(std::move(m->compaction)); if (!c) { m->done = true; + 
m->manual_end = nullptr; LogToBuffer(log_buffer, "[%s] Manual compaction from level-%d from %s .. " "%s; nothing to do\n", @@ -2572,9 +3322,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, m->cfd->GetName().c_str(), m->input_level, c->output_level(), (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), - ((m->done || manual_end == nullptr) + ((m->done || m->manual_end == nullptr) ? "(end)" - : manual_end->DebugString().c_str())); + : m->manual_end->DebugString().c_str())); } } else if (!compaction_queue_.empty()) { // cfd is referenced here @@ -2591,6 +3341,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, return Status::OK(); } + if (HaveManualCompaction(cfd)) { + // Can't compact right now, but try again later + TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); + return Status::OK(); + } + // Pick up latest mutable CF Options and use it throughout the // compaction job // Compaction makes a copy of the latest MutableCFOptions. It should be used @@ -2601,7 +3357,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // NOTE: try to avoid unnecessary copy of MutableCFOptions if // compaction is not necessary. 
Need to make sure mutex is held // until we make a copy in the following code + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); if (c != nullptr) { // update statistics MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, @@ -2628,6 +3386,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } } + if (c != nullptr) { + running_compactions_.insert(c.get()); + } + if (!c) { // Nothing to do LogToBuffer(log_buffer, "Compaction nothing to do"); @@ -2657,7 +3419,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); // Instrument for event update // TODO(yhchiang): add op details for showing trivial-move. - ThreadStatusUtil::SetColumnFamily(c->column_family_data()); + ThreadStatusUtil::SetColumnFamily( + c->column_family_data(), c->column_family_data()->ioptions()->env, + c->column_family_data()->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); compaction_job_stats.num_input_files = c->num_input_files(0); @@ -2717,14 +3481,20 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, int output_level __attribute__((unused)) = c->output_level(); TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", &output_level); + + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, - snapshots_.GetAll(), table_cache_, &event_logger_, + directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, + &bg_error_, snapshot_seqs, 
earliest_write_conflict_snapshot, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, - c->mutable_cf_options()->compaction_measure_io_stats, dbname_, + c->mutable_cf_options()->report_bg_io_stats, dbname_, &compaction_job_stats); compaction_job.Prepare(); @@ -2733,7 +3503,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); - status = compaction_job.Install(*c->mutable_cf_options(), &mutex_); + status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWorkWrapper( c->column_family_data(), job_context, *c->mutable_cf_options()); @@ -2741,11 +3511,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, *made_progress = true; } if (c != nullptr) { + c->ReleaseCompactionFiles(status); + *made_progress = true; NotifyOnCompactionCompleted( c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); - c->ReleaseCompactionFiles(status); - *made_progress = true; + running_compactions_.erase(c.get()); } // this will unref its input_version and column_family_data c.reset(); @@ -2763,7 +3534,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } if (is_manual) { - ManualCompaction* m = manual_compaction_; + ManualCompaction* m = manual_compaction; if (!status.ok()) { m->status = status; m->done = true; @@ -2781,7 +3552,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // Stop the compaction if manual_end points to nullptr -- this means // that we compacted the whole range. 
manual_end should always point // to nullptr in case of universal compaction - if (manual_end == nullptr) { + if (m->manual_end == nullptr) { m->done = true; } if (!m->done) { @@ -2792,23 +3563,116 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, kCompactionStyleUniversal || m->cfd->ioptions()->num_levels > 1); assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); - m->tmp_storage = *manual_end; + m->tmp_storage = *m->manual_end; m->begin = &m->tmp_storage; + m->incomplete = true; } m->in_progress = false; // not being processed anymore - manual_compaction_ = nullptr; } + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish"); return status; } +bool DBImpl::HasPendingManualCompaction() { + return (!manual_compaction_dequeue_.empty()); +} + +void DBImpl::AddManualCompaction(DBImpl::ManualCompaction* m) { + manual_compaction_dequeue_.push_back(m); +} + +void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) { + // Remove from queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if (m == (*it)) { + it = manual_compaction_dequeue_.erase(it); + return; + } + it++; + } + assert(false); + return; +} + +bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) { + if (m->exclusive) { + return (bg_compaction_scheduled_ > 0); + } + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + bool seen = false; + while (it != manual_compaction_dequeue_.end()) { + if (m == (*it)) { + it++; + seen = true; + continue; + } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { + // Consider the other manual compaction *it, conflicts if: + // overlaps with m + // and (*it) is ahead in the queue and is not yet in progress + return true; + } + it++; + } + return false; +} + +bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { + // Remove from priority queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != 
manual_compaction_dequeue_.end()) { + if ((*it)->exclusive) { + return true; + } + if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) { + // Allow automatic compaction if manual compaction is + // is in progress + return true; + } + it++; + } + return false; +} + +bool DBImpl::HasExclusiveManualCompaction() { + // Remove from priority queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if ((*it)->exclusive) { + return true; + } + it++; + } + return false; +} + +bool DBImpl::MCOverlap(ManualCompaction* m, ManualCompaction* m1) { + if ((m->exclusive) || (m1->exclusive)) { + return true; + } + if (m->cfd != m1->cfd) { + return false; + } + return true; +} + namespace { struct IterState { - IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version) - : db(_db), mu(_mu), super_version(_super_version) {} + IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version, + bool _background_purge) + : db(_db), + mu(_mu), + super_version(_super_version), + background_purge(_background_purge) {} DBImpl* db; InstrumentedMutex* mu; SuperVersion* super_version; + bool background_purge; }; static void CleanupIteratorState(void* arg1, void* arg2) { @@ -2822,11 +3686,24 @@ static void CleanupIteratorState(void* arg1, void* arg2) { state->mu->Lock(); state->super_version->Cleanup(); state->db->FindObsoleteFiles(&job_context, false, true); + if (state->background_purge) { + state->db->ScheduleBgLogWriterClose(&job_context); + } state->mu->Unlock(); delete state->super_version; if (job_context.HaveSomethingToDelete()) { - state->db->PurgeObsoleteFiles(job_context); + if (state->background_purge) { + // PurgeObsoleteFiles here does not delete files. Instead, it adds the + // files to be deleted to a job queue, and deletes it in a separate + // background thread. 
+ state->db->PurgeObsoleteFiles(job_context, true /* schedule only */); + state->mu->Lock(); + state->db->SchedulePurge(); + state->mu->Unlock(); + } else { + state->db->PurgeObsoleteFiles(job_context); + } } job_context.Clean(); } @@ -2835,11 +3712,11 @@ static void CleanupIteratorState(void* arg1, void* arg2) { } } // namespace -Iterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, - ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena) { - Iterator* internal_iter; +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena) { + InternalIterator* internal_iter; assert(arena != nullptr); // Need to create internal iterator from the arena. MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); @@ -2852,7 +3729,9 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, super_version->current->AddIterators(read_options, env_options_, &merge_iter_builder); internal_iter = merge_iter_builder.Finish(); - IterState* cleanup = new IterState(this, &mutex_, super_version); + IterState* cleanup = + new IterState(this, &mutex_, super_version, + read_options.background_purge_on_iterator_cleanup); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); return internal_iter; @@ -2948,13 +3827,19 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (sv->mem->Get(lkey, value, &s, &merge_context)) { - // Done - RecordTick(stats_, MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { - // Done - RecordTick(stats_, MEMTABLE_HIT); - } else { + bool skip_memtable = + (read_options.read_tier == kPersistedTier && has_unpersisted_data_); + bool done = false; + if (!skip_memtable) { + if (sv->mem->Get(lkey, value, &s, &merge_context)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if 
(sv->imm->Get(lkey, value, &s, &merge_context)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } + } + if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get(read_options, lkey, value, &s, &merge_context, value_found); @@ -2968,6 +3853,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, RecordTick(stats_, NUMBER_KEYS_READ); RecordTick(stats_, BYTES_READ, value->size()); + MeasureTime(stats_, BYTES_PER_READ, value->size()); } return s; } @@ -3038,14 +3924,23 @@ std::vector DBImpl::MultiGet( assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; auto super_version = mgd->super_version; - if (super_version->mem->Get(lkey, value, &s, &merge_context)) { - // Done - } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { - // Done - } else { + bool skip_memtable = + (read_options.read_tier == kPersistedTier && has_unpersisted_data_); + bool done = false; + if (!skip_memtable) { + if (super_version->mem->Get(lkey, value, &s, &merge_context)) { + done = true; + // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { + done = true; + // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + } + } + if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(read_options, lkey, value, &s, &merge_context); + // TODO(?): RecordTick(stats_, MEMTABLE_MISS)? 
} if (s.ok()) { @@ -3078,233 +3973,23 @@ std::vector DBImpl::MultiGet( RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); + MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read); PERF_TIMER_STOP(get_post_process_time); return stat_list; } -#ifndef ROCKSDB_LITE -Status DBImpl::AddFile(ColumnFamilyHandle* column_family, - const std::string& file_path, bool move_file) { - Status status; - auto cfh = reinterpret_cast(column_family); - ColumnFamilyData* cfd = cfh->cfd(); - - ExternalSstFileInfo file_info; - file_info.file_path = file_path; - status = env_->GetFileSize(file_path, &file_info.file_size); - if (!status.ok()) { - return status; - } - - // Access the file using TableReader to extract - // version, number of entries, smallest user key, largest user key - std::unique_ptr sst_file; - status = env_->NewRandomAccessFile(file_path, &sst_file, env_options_); - if (!status.ok()) { - return status; - } - std::unique_ptr sst_file_reader; - sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file))); - - std::unique_ptr table_reader; - status = cfd->ioptions()->table_factory->NewTableReader( - TableReaderOptions(*cfd->ioptions(), env_options_, - cfd->internal_comparator()), - std::move(sst_file_reader), file_info.file_size, &table_reader); - if (!status.ok()) { - return status; - } - - // Get the external sst file version from table properties - const UserCollectedProperties& user_collected_properties = - table_reader->GetTableProperties()->user_collected_properties; - UserCollectedProperties::const_iterator external_sst_file_version_iter = - user_collected_properties.find(ExternalSstFilePropertyNames::kVersion); - if (external_sst_file_version_iter == user_collected_properties.end()) { - return Status::InvalidArgument("Generated table version not found"); - } - - file_info.version = - DecodeFixed32(external_sst_file_version_iter->second.c_str()); - if 
(file_info.version == 1) { - // version 1 imply that all sequence numbers in table equal 0 - file_info.sequence_number = 0; - } else { - return Status::InvalidArgument("Generated table version is not supported"); - } - - // Get number of entries in table - file_info.num_entries = table_reader->GetTableProperties()->num_entries; - - ParsedInternalKey key; - std::unique_ptr iter(table_reader->NewIterator(ReadOptions())); - - // Get first (smallest) key from file - iter->SeekToFirst(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("Generated table have corrupted keys"); - } - if (key.sequence != 0) { - return Status::Corruption("Generated table have non zero sequence number"); - } - file_info.smallest_key = key.user_key.ToString(); - - // Get last (largest) key from file - iter->SeekToLast(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("Generated table have corrupted keys"); - } - if (key.sequence != 0) { - return Status::Corruption("Generated table have non zero sequence number"); - } - file_info.largest_key = key.user_key.ToString(); - - return AddFile(column_family, &file_info, move_file); -} - -Status DBImpl::AddFile(ColumnFamilyHandle* column_family, - const ExternalSstFileInfo* file_info, bool move_file) { - Status status; - auto cfh = reinterpret_cast(column_family); - ColumnFamilyData* cfd = cfh->cfd(); - - if (cfd->NumberLevels() <= 1) { - return Status::NotSupported( - "AddFile requires a database with at least 2 levels"); - } - if (file_info->version != 1) { - return Status::InvalidArgument("Generated table version is not supported"); - } - // version 1 imply that file have only Put Operations with Sequence Number = 0 - - FileMetaData meta; - meta.smallest = - InternalKey(file_info->smallest_key, file_info->sequence_number, - ValueType::kTypeValue); - meta.largest = InternalKey(file_info->largest_key, file_info->sequence_number, - ValueType::kTypeValue); - if (!meta.smallest.Valid() || 
!meta.largest.Valid()) { - return Status::Corruption("Generated table have corrupted keys"); - } - meta.smallest_seqno = file_info->sequence_number; - meta.largest_seqno = file_info->sequence_number; - if (meta.smallest_seqno != 0 || meta.largest_seqno != 0) { - return Status::InvalidArgument( - "Non zero sequence numbers are not supported"); - } - // Generate a location for the new table - meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, file_info->file_size); - std::string db_fname = TableFileName( - db_options_.db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); - - if (move_file) { - status = env_->LinkFile(file_info->file_path, db_fname); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - status = CopyFile(env_, file_info->file_path, db_fname, 0); - } - } else { - status = CopyFile(env_, file_info->file_path, db_fname, 0); - } - if (!status.ok()) { - return status; - } - - { - InstrumentedMutexLock l(&mutex_); - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); - - WriteThread::Writer w; - write_thread_.EnterUnbatched(&w, &mutex_); - - // Make sure memtables are empty - if (!cfd->mem()->IsEmpty() || cfd->imm()->NumNotFlushed() > 0) { - // Cannot add the file since the keys in memtable - // will hide the keys in file - status = Status::NotSupported("Memtable is not empty"); - } - - // Make sure last sequence number is 0, if there are existing files then - // they should have sequence number = 0 - if (status.ok() && versions_->LastSequence() > 0) { - status = Status::NotSupported("Last Sequence number is not zero"); - } - - auto* vstorage = cfd->current()->storage_info(); - if (status.ok()) { - // Make sure that the key range in the file we will add does not overlap - // with previously added files - Slice smallest_user_key = meta.smallest.user_key(); - Slice largest_user_key = meta.largest.user_key(); - for (int level = 0; level < vstorage->num_non_empty_levels(); 
level++) { - if (vstorage->OverlapInLevel(level, &smallest_user_key, - &largest_user_key)) { - status = Status::NotSupported("Cannot add overlapping files"); - break; - } - } - } - - if (status.ok()) { - // We add the file to the last level - int target_level = cfd->NumberLevels() - 1; - if (cfd->ioptions()->level_compaction_dynamic_level_bytes == false) { - // If we are using dynamic level compaction we add the file to - // last level with files - target_level = vstorage->num_non_empty_levels() - 1; - if (target_level <= 0) { - target_level = 1; - } - } - VersionEdit edit; - edit.SetColumnFamily(cfd->GetID()); - edit.AddFile(target_level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno, - meta.marked_for_compaction); - - status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, - directories_.GetDbDir()); - } - write_thread_.ExitUnbatched(&w); - - if (status.ok()) { - delete InstallSuperVersionAndScheduleWork(cfd, nullptr, - mutable_cf_options); - } - } - - if (!status.ok()) { - // We failed to add the file to the database - Status s = env_->DeleteFile(db_fname); - if (!s.ok()) { - Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, - "AddFile() clean up for file %s failed : %s", db_fname.c_str(), - s.ToString().c_str()); - } - } else if (status.ok() && move_file) { - // The file was moved and added successfully, remove original file link - Status s = env_->DeleteFile(file_info->file_path); - if (!s.ok()) { - Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, - "%s was added to DB successfully but failed to remove original file " - "link : %s", - file_info->file_path.c_str(), s.ToString().c_str()); - } - } - return status; -} -#endif // ROCKSDB_LITE - Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { Status s; + Status persist_options_status; *handle = nullptr; s = 
CheckCompressionSupported(cf_options); + if (s.ok() && db_options_.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cf_options); + } if (!s.ok()) { return s; } @@ -3334,6 +4019,12 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, s = versions_->LogAndApply( nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit, &mutex_, directories_.GetDbDir(), false, &cf_options); + + if (s.ok()) { + // If the column family was created successfully, we then persist + // the updated RocksDB options under the same single write thread + persist_options_status = WriteOptionsFile(); + } write_thread_.ExitUnbatched(&w); } if (s.ok()) { @@ -3363,6 +4054,17 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, if (s.ok()) { NewThreadStatusCfInfo( reinterpret_cast(*handle)->cfd()); + if (!persist_options_status.ok()) { + if (db_options_.fail_if_options_file_error) { + s = Status::IOError( + "ColumnFamily has been created, but unable to persist" + "options in CreateColumnFamily()", + persist_options_status.ToString().c_str()); + } + Warn(db_options_.info_log, + "Unable to persist options in CreateColumnFamily() -- %s", + persist_options_status.ToString().c_str()); + } } return s; } @@ -3381,6 +4083,7 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { edit.SetColumnFamily(cfd->GetID()); Status s; + Status options_persist_status; { InstrumentedMutexLock l(&mutex_); if (cfd->IsDropped()) { @@ -3392,6 +4095,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { write_thread_.EnterUnbatched(&w, &mutex_); s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_); + if (s.ok()) { + // If the column family was dropped successfully, we then persist + // the updated RocksDB options under the same single write thread + options_persist_status = WriteOptionsFile(); + } write_thread_.ExitUnbatched(&w); } @@ -3419,8 +4127,19 @@ Status 
DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * mutable_cf_options->max_write_buffer_number; Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, - "Dropped column family with id %u\n", - cfd->GetID()); + "Dropped column family with id %u\n", cfd->GetID()); + + if (!options_persist_status.ok()) { + if (db_options_.fail_if_options_file_error) { + s = Status::IOError( + "ColumnFamily has been dropped, but unable to persist " + "options in DropColumnFamily()", + options_persist_status.ToString().c_str()); + } + Warn(db_options_.info_log, + "Unable to persist options in DropColumnFamily() -- %s", + options_persist_status.ToString().c_str()); + } } else { Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "Dropping column family with id %u FAILED -- %s\n", @@ -3449,6 +4168,10 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, Iterator* DBImpl::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -3476,10 +4199,12 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, #else SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto iter = new ForwardIterator(this, read_options, cfd, sv); - return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, + return NewDBIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), iter, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_options.iterate_upper_bound); + sv->version_number, read_options.iterate_upper_bound, + read_options.prefix_same_as_start, read_options.pin_data); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -3534,11 +4259,12 @@ Iterator* 
DBImpl::NewIterator(const ReadOptions& read_options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), - snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_options.iterate_upper_bound); + env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_options.iterate_upper_bound, + read_options.prefix_same_as_start, read_options.pin_data); - Iterator* internal_iter = + InternalIterator* internal_iter = NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); @@ -3552,6 +4278,10 @@ Status DBImpl::NewIterators( const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } iterators->clear(); iterators->reserve(column_families.size()); XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new, @@ -3582,10 +4312,11 @@ Status DBImpl::NewIterators( auto cfd = reinterpret_cast(cfh)->cfd(); SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto iter = new ForwardIterator(this, read_options, cfd, sv); - iterators->push_back( - NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations)); + iterators->push_back(NewDBIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, nullptr, false, read_options.pin_data)); } #endif } else { @@ -3604,9 +4335,10 @@ Status DBImpl::NewIterators( ArenaWrappedDBIter* db_iter = 
NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations); - Iterator* internal_iter = NewInternalIterator( - read_options, cfd, sv, db_iter->GetArena()); + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, nullptr, false, read_options.pin_data); + InternalIterator* internal_iter = + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -3615,7 +4347,15 @@ Status DBImpl::NewIterators( return Status::OK(); } -const Snapshot* DBImpl::GetSnapshot() { +const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } + +#ifndef ROCKSDB_LITE +const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { + return GetSnapshotImpl(true); +} +#endif // ROCKSDB_LITE + +const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) { int64_t unix_time = 0; env_->GetCurrentTime(&unix_time); // Ignore error SnapshotImpl* s = new SnapshotImpl; @@ -3626,7 +4366,8 @@ const Snapshot* DBImpl::GetSnapshot() { delete s; return nullptr; } - return snapshots_.New(s, versions_->LastSequence(), unix_time); + return snapshots_.New(s, versions_->LastSequence(), unix_time, + is_write_conflict_boundary); } void DBImpl::ReleaseSnapshot(const Snapshot* s) { @@ -3666,19 +4407,21 @@ Status DBImpl::SingleDelete(const WriteOptions& write_options, } Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { - return WriteImpl(write_options, my_batch, nullptr); + return WriteImpl(write_options, my_batch, nullptr, nullptr); } #ifndef ROCKSDB_LITE Status DBImpl::WriteWithCallback(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback) { - return WriteImpl(write_options, my_batch, callback); + return WriteImpl(write_options, my_batch, callback, nullptr); } #endif // ROCKSDB_LITE Status DBImpl::WriteImpl(const WriteOptions& 
write_options, - WriteBatch* my_batch, WriteCallback* callback) { + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } @@ -3687,7 +4430,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } Status status; - bool callback_failed = false; bool xfunc_attempted_write = false; XFUNC_TEST("transaction", "transaction_xftest_write_impl", @@ -3704,9 +4446,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, w.batch = my_batch; w.sync = write_options.sync; w.disableWAL = write_options.disableWAL; + w.disable_memtable = disable_memtable; w.in_batch_group = false; - w.done = false; - w.has_callback = (callback != nullptr) ? true : false; + w.callback = callback; + w.log_ref = log_ref; if (!write_options.disableWAL) { RecordTick(stats_, WRITE_WITH_WAL); @@ -3715,12 +4458,46 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE); write_thread_.JoinBatchGroup(&w); - if (w.done) { - // write was done by someone else, no need to grab mutex + if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) { + // we are a non-leader in a parallel group + PERF_TIMER_GUARD(write_memtable_time); + + if (log_used != nullptr) { + *log_used = w.log_used; + } + + if (w.ShouldWriteToMemtable()) { + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + WriteBatchInternal::SetSequence(w.batch, w.sequence); + w.status = WriteBatchInternal::InsertInto( + &w, &column_family_memtables, &flush_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + true /*concurrent_memtable_writes*/); + } + + if (write_thread_.CompleteParallelWorker(&w)) { + // we're responsible for early exit + auto last_sequence = w.parallel_group->last_sequence; + SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); + 
versions_->SetLastSequence(last_sequence); + write_thread_.EarlyExitParallelGroup(&w); + } + assert(w.state == WriteThread::STATE_COMPLETED); + // STATE_COMPLETED conditional below handles exit + + status = w.FinalStatus(); + } + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + // write is complete and leader has updated sequence RecordTick(stats_, WRITE_DONE_BY_OTHER); - return w.status; + return w.FinalStatus(); } // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); WriteContext context; mutex_.Lock(); @@ -3742,9 +4519,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) ? 4 * max_total_in_memory_state_ : db_options_.max_total_wal_size; - if (UNLIKELY(!single_column_family_mode_) && - alive_log_files_.begin()->getting_flushed == false && - total_log_size_ > max_total_wal_size) { + if (UNLIKELY(!single_column_family_mode_ && + alive_log_files_.begin()->getting_flushed == false && + total_log_size_ > max_total_wal_size)) { uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, @@ -3767,27 +4544,44 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } } MaybeScheduleFlushOrCompaction(); - } else if (UNLIKELY(write_buffer_.ShouldFlush())) { + } else if (UNLIKELY(write_buffer_manager_->ShouldFlush())) { + // Before a new memtable is added in SwitchMemtable(), + // write_buffer_manager_->ShouldFlush() will keep returning true. If another + // thread is writing to another DB with the same write buffer, they may also + // be flushed. We may end up with flushing much more DBs than needed. It's + // suboptimal but still correct. Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, - "Flushing all column families. 
Write buffer is using %" PRIu64 - " bytes out of a total of %" PRIu64 ".", - write_buffer_.memory_usage(), write_buffer_.buffer_size()); + "Flushing column family with largest mem table size. Write buffer is " + "using %" PRIu64 " bytes out of a total of %" PRIu64 ".", + write_buffer_manager_->memory_usage(), + write_buffer_manager_->buffer_size()); // no need to refcount because drop is happening in write thread, so can't // happen while we're in the write thread + ColumnFamilyData* largest_cfd = nullptr; + size_t largest_cfd_size = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->IsDropped()) { continue; } if (!cfd->mem()->IsEmpty()) { - status = SwitchMemtable(cfd, &context); - if (!status.ok()) { - break; + // We only consider active mem table, hoping immutable memtable is + // already in the process of flushing. + size_t cfd_size = cfd->mem()->ApproximateMemoryUsage(); + if (largest_cfd == nullptr || cfd_size > largest_cfd_size) { + largest_cfd = cfd; + largest_cfd_size = cfd_size; } - cfd->imm()->FlushRequested(); - SchedulePendingFlush(cfd); } } - MaybeScheduleFlushOrCompaction(); + if (largest_cfd != nullptr) { + status = SwitchMemtable(largest_cfd, &context); + if (status.ok()) { + largest_cfd->imm()->FlushRequested(); + SchedulePendingFlush(largest_cfd); + MaybeScheduleFlushOrCompaction(); + } + } } if (UNLIKELY(status.ok() && !bg_error_.ok())) { @@ -3798,8 +4592,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, status = ScheduleFlushes(&context); } - if (UNLIKELY(status.ok()) && - (write_controller_.IsStopped() || write_controller_.NeedsDelay())) { + if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || + write_controller_.NeedsDelay()))) { PERF_TIMER_STOP(write_pre_and_post_process_time); PERF_TIMER_GUARD(write_delay_time); // We don't know size of curent batch so that we always use the size @@ -3812,14 +4606,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, uint64_t last_sequence = 
versions_->LastSequence(); WriteThread::Writer* last_writer = &w; - autovector write_batch_group; + autovector write_group; bool need_log_sync = !write_options.disableWAL && write_options.sync; bool need_log_dir_sync = need_log_sync && !log_dir_synced_; if (status.ok()) { - last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader( - &w, &last_writer, &write_batch_group); - if (need_log_sync) { while (logs_.front().getting_synced) { log_sync_cv_.Wait(); @@ -3834,147 +4625,233 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // during this phase since &w is currently responsible for logging // and protects against concurrent loggers and concurrent writes // into memtables - - mutex_.Unlock(); - - if (callback != nullptr) { - // If this write has a validation callback, check to see if this write - // is able to be written. Must be called on the write thread. - status = callback->Callback(this); - callback_failed = true; - } - } else { - mutex_.Unlock(); } + mutex_.Unlock(); + // At this point the mutex is unlocked + bool exit_completed_early = false; + last_batch_group_size_ = + write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group); + if (status.ok()) { - WriteBatch* updates = nullptr; - if (write_batch_group.size() == 1) { - updates = write_batch_group[0]; + // Rules for when we can update the memtable concurrently + // 1. supported by memtable + // 2. Puts are not okay if inplace_update_support + // 3. Deletes or SingleDeletes are not okay if filtering deletes + // (controlled by both batch and memtable setting) + // 4. Merges are not okay + // + // Rules 1..3 are enforced by checking the options + // during startup (CheckConcurrentWritesSupported), so if + // options.allow_concurrent_memtable_write is true then they can be + // assumed to be true. Rule 4 is checked for each batch. We could + // relax rules 2 and 3 if we could prevent write batches from referring + // more than once to a particular key. 
+ bool parallel = + db_options_.allow_concurrent_memtable_write && write_group.size() > 1; + int total_count = 0; + uint64_t total_byte_size = 0; + for (auto writer : write_group) { + if (writer->CheckCallback(this)) { + if (writer->ShouldWriteToMemtable()) { + total_count += WriteBatchInternal::Count(writer->batch); + parallel = parallel && !writer->batch->HasMerge(); + } + + if (writer->ShouldWriteToWAL()) { + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + } + } + } + + const SequenceNumber current_sequence = last_sequence + 1; + last_sequence += total_count; + + // Record statistics + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size); + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (write_options.disableWAL) { + has_unpersisted_data_ = true; + } + + uint64_t log_size = 0; + if (!write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + + WriteBatch* merged_batch = nullptr; + if (write_group.size() == 1 && write_group[0]->ShouldWriteToWAL()) { + merged_batch = write_group[0]->batch; + write_group[0]->log_used = logfile_number_; } else { - updates = &tmp_batch_; - for (size_t i = 0; i < write_batch_group.size(); ++i) { - WriteBatchInternal::Append(updates, write_batch_group[i]); + // WAL needs all of the batches flattened into a single batch. 
+ // We could avoid copying here with an iov-like AddRecord + // interface + merged_batch = &tmp_batch_; + for (auto writer : write_group) { + if (writer->ShouldWriteToWAL()) { + WriteBatchInternal::Append(merged_batch, writer->batch); + } + writer->log_used = logfile_number_; } } - const SequenceNumber current_sequence = last_sequence + 1; - WriteBatchInternal::SetSequence(updates, current_sequence); - int my_batch_count = WriteBatchInternal::Count(updates); - last_sequence += my_batch_count; - const uint64_t batch_size = WriteBatchInternal::ByteSize(updates); - // Record statistics - RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count); - RecordTick(stats_, BYTES_WRITTEN, batch_size); - if (write_options.disableWAL) { - flush_on_destroy_ = true; + if (log_used != nullptr) { + *log_used = logfile_number_; } - PERF_TIMER_STOP(write_pre_and_post_process_time); - - uint64_t log_size = 0; - if (!write_options.disableWAL) { - PERF_TIMER_GUARD(write_wal_time); - Slice log_entry = WriteBatchInternal::Contents(updates); - status = logs_.back().writer->AddRecord(log_entry); - total_log_size_ += log_entry.size(); - alive_log_files_.back().AddSize(log_entry.size()); - log_empty_ = false; - log_size = log_entry.size(); - RecordTick(stats_, WAL_FILE_BYTES, log_size); - if (status.ok() && need_log_sync) { - RecordTick(stats_, WAL_FILE_SYNCED); - StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); - // It's safe to access logs_ with unlocked mutex_ here because: - // - we've set getting_synced=true for all logs, - // so other threads won't pop from logs_ while we're here, - // - only writer thread can push to logs_, and we're in - // writer thread, so no one will push to logs_, - // - as long as other threads don't modify it, it's safe to read - // from std::deque from multiple threads concurrently. 
- for (auto& log : logs_) { - status = log.writer->file()->Sync(db_options_.use_fsync); - if (!status.ok()) { - break; - } + + WriteBatchInternal::SetSequence(merged_batch, current_sequence); + + Slice log_entry = WriteBatchInternal::Contents(merged_batch); + status = logs_.back().writer->AddRecord(log_entry); + total_log_size_ += log_entry.size(); + alive_log_files_.back().AddSize(log_entry.size()); + log_empty_ = false; + log_size = log_entry.size(); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + if (status.ok() && need_log_sync) { + RecordTick(stats_, WAL_FILE_SYNCED); + StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + // It's safe to access logs_ with unlocked mutex_ here because: + // - we've set getting_synced=true for all logs, + // so other threads won't pop from logs_ while we're here, + // - only writer thread can push to logs_, and we're in + // writer thread, so no one will push to logs_, + // - as long as other threads don't modify it, it's safe to read + // from std::deque from multiple threads concurrently. + for (auto& log : logs_) { + status = log.writer->file()->Sync(db_options_.use_fsync); + if (!status.ok()) { + break; } - if (status.ok() && need_log_dir_sync) { - // We only sync WAL directory the first time WAL syncing is - // requested, so that in case users never turn on WAL sync, - // we can avoid the disk I/O in the write code path. - status = directories_.GetWalDir()->Fsync(); + } + if (status.ok() && need_log_dir_sync) { + // We only sync WAL directory the first time WAL syncing is + // requested, so that in case users never turn on WAL sync, + // we can avoid the disk I/O in the write code path. + status = directories_.GetWalDir()->Fsync(); + } + } + + if (merged_batch == &tmp_batch_) { + tmp_batch_.Clear(); + } + } + if (status.ok()) { + PERF_TIMER_GUARD(write_memtable_time); + + { + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. 
+ // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early in + // some cases. + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count); + if (!write_options.disableWAL) { + if (write_options.sync) { + stats->AddDBStats(InternalStats::WAL_FILE_SYNCED, 1); + } + stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size); + } + uint64_t for_other = write_group.size() - 1; + if (for_other > 0) { + stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, for_other); + if (!write_options.disableWAL) { + stats->AddDBStats(InternalStats::WRITE_WITH_WAL, for_other); } } } - if (status.ok()) { - PERF_TIMER_GUARD(write_memtable_time); + if (!parallel) { status = WriteBatchInternal::InsertInto( - updates, column_family_memtables_.get(), - write_options.ignore_missing_column_families, 0, this, false); - // A non-OK status here indicates iteration failure (either in-memory - // writebatch corruption (very bad), or the client specified invalid - // column family). This will later on trigger bg_error_. - // - // Note that existing logic was not sound. Any partial failure writing - // into the memtable would result in a state that some write ops might - // have succeeded in memtable but Status reports error for all writes. + write_group, current_sequence, column_family_memtables_.get(), + &flush_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this); + + if (status.ok()) { + // There were no write failures. Set leader's status + // in case the write callback returned a non-ok status. 
+ status = w.FinalStatus(); + } - SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); - } - PERF_TIMER_START(write_pre_and_post_process_time); - if (updates == &tmp_batch_) { - tmp_batch_.Clear(); + } else { + WriteThread::ParallelGroup pg; + pg.leader = &w; + pg.last_writer = last_writer; + pg.last_sequence = last_sequence; + pg.early_exit_allowed = !need_log_sync; + pg.running.store(static_cast(write_group.size()), + std::memory_order_relaxed); + write_thread_.LaunchParallelFollowers(&pg, current_sequence); + + if (w.ShouldWriteToMemtable()) { + // do leader write + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + assert(w.sequence == current_sequence); + WriteBatchInternal::SetSequence(w.batch, w.sequence); + w.status = WriteBatchInternal::InsertInto( + &w, &column_family_memtables, &flush_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, + this, true /*concurrent_memtable_writes*/); + } + + // CompleteParallelWorker returns true if this thread should + // handle exit, false means somebody else did + exit_completed_early = !write_thread_.CompleteParallelWorker(&w); + status = w.FinalStatus(); } - mutex_.Lock(); - // internal stats - default_cf_internal_stats_->AddDBStats( - InternalStats::BYTES_WRITTEN, batch_size); - default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, - my_batch_count); - if (!write_options.disableWAL) { - if (write_options.sync) { - default_cf_internal_stats_->AddDBStats(InternalStats::WAL_FILE_SYNCED, - 1); + if (!exit_completed_early && w.status.ok()) { + SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); + versions_->SetLastSequence(last_sequence); + if (!need_log_sync) { + write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); + exit_completed_early = true; } - default_cf_internal_stats_->AddDBStats( - InternalStats::WAL_FILE_BYTES, log_size); } - if (status.ok()) { - versions_->SetLastSequence(last_sequence); + + // A non-OK 
status here indicates that the state implied by the + // WAL has diverged from the in-memory state. This could be + // because of a corrupt write_batch (very bad), or because the + // client specified an invalid column family and didn't specify + // ignore_missing_column_families. + // + // Is setting bg_error_ enough here? This will at least stop + // compaction and fail any further writes. + if (!status.ok() && bg_error_.ok() && !w.CallbackFailed()) { + bg_error_ = status; } - } else { - // Operation failed. Make sure sure mutex is held for cleanup code below. - mutex_.Lock(); + } } + PERF_TIMER_START(write_pre_and_post_process_time); - if (db_options_.paranoid_checks && !status.ok() && !callback_failed && - !status.IsBusy() && bg_error_.ok()) { - bg_error_ = status; // stop compaction & fail any further writes + if (db_options_.paranoid_checks && !status.ok() && !w.CallbackFailed() && + !status.IsBusy()) { + mutex_.Lock(); + if (bg_error_.ok()) { + bg_error_ = status; // stop compaction & fail any further writes + } + mutex_.Unlock(); } - mutex_.AssertHeld(); - if (need_log_sync) { + mutex_.Lock(); MarkLogsSynced(logfile_number_, need_log_dir_sync, status); + mutex_.Unlock(); } - uint64_t writes_for_other = write_batch_group.size() - 1; - if (writes_for_other > 0) { - default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, - writes_for_other); - if (!write_options.disableWAL) { - default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, - writes_for_other); - } + if (!exit_completed_early) { + write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); } - mutex_.Unlock(); - - write_thread_.ExitAsBatchGroupLeader(&w, last_writer, status); - return status; } @@ -4012,7 +4889,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes) { Status DBImpl::ScheduleFlushes(WriteContext* context) { ColumnFamilyData* cfd; - while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + while ((cfd = 
flush_scheduler_.TakeNextColumnFamily()) != nullptr) { auto status = SwitchMemtable(cfd, context); if (cfd->Unref()) { delete cfd; @@ -4024,6 +4901,22 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { return Status::OK(); } +#ifndef ROCKSDB_LITE +void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd, + const MemTableInfo& mem_table_info) { + if (db_options_.listeners.size() == 0U) { + return; + } + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + + for (auto listener : db_options_.listeners) { + listener->OnMemTableSealed(mem_table_info); + } +} +#endif // ROCKSDB_LITE + // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { @@ -4036,27 +4929,57 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // Do this without holding the dbmutex lock. assert(versions_->prev_log_number() == 0); bool creating_new_log = !log_empty_; + uint64_t recycle_log_number = 0; + if (creating_new_log && db_options_.recycle_log_file_num && + !log_recycle_files.empty()) { + recycle_log_number = log_recycle_files.front(); + log_recycle_files.pop_front(); + } uint64_t new_log_number = creating_new_log ? versions_->NewFileNumber() : logfile_number_; SuperVersion* new_superversion = nullptr; const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + + // Set current_memtble_info for memtable sealed callback +#ifndef ROCKSDB_LITE + MemTableInfo memtable_info; + memtable_info.cf_name = cfd->GetName(); + memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); + memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); + memtable_info.num_entries = cfd->mem()->num_entries(); + memtable_info.num_deletes = cfd->mem()->num_deletes(); +#endif // ROCKSDB_LITE + // Log this later after lock release. 
It may be outdated, e.g., if background + // flush happens before logging, but that should be ok. + int num_imm_unflushed = cfd->imm()->NumNotFlushed(); mutex_.Unlock(); Status s; { if (creating_new_log) { EnvOptions opt_env_opt = env_->OptimizeForLogWrite(env_options_, db_options_); - s = env_->NewWritableFile( - LogFileName(db_options_.wal_dir, new_log_number), &lfile, - opt_env_opt); + if (recycle_log_number) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); + s = env_->ReuseWritableFile( + LogFileName(db_options_.wal_dir, new_log_number), + LogFileName(db_options_.wal_dir, recycle_log_number), &lfile, + opt_env_opt); + } else { + s = NewWritableFile(env_, + LogFileName(db_options_.wal_dir, new_log_number), + &lfile, opt_env_opt); + } if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. lfile->SetPreallocationBlockSize( - 1.1 * mutable_cf_options.write_buffer_size); + mutable_cf_options.write_buffer_size / 10 + + mutable_cf_options.write_buffer_size); unique_ptr file_writer( new WritableFileWriter(std::move(lfile), opt_env_opt)); - new_log = new log::Writer(std::move(file_writer)); + new_log = new log::Writer(std::move(file_writer), new_log_number, + db_options_.recycle_log_file_num > 0); } } @@ -4065,10 +4988,18 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); new_superversion = new SuperVersion(); } + +#ifndef ROCKSDB_LITE + // PLEASE NOTE: We assume that there are no failable operations + // after lock is acquired below since we are already notifying + // client about mem table becoming immutable. 
+ NotifyOnMemTableSealed(cfd, memtable_info); +#endif //ROCKSDB_LITE } - Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, - "[%s] New memtable created with log file: #%" PRIu64 "\n", - cfd->GetName().c_str(), new_log_number); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] New memtable created with log file: #%" PRIu64 + ". Immutable memtables: %d.\n", + cfd->GetName().c_str(), new_log_number, num_imm_unflushed); mutex_.Lock(); if (!s.ok()) { // how do we fail if we're not creating new log? @@ -4125,6 +5056,29 @@ Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, return s; } + +Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, + const Range* range, std::size_t n, + TablePropertiesCollection* props) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfTablesInRange(range, n, props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + #endif // ROCKSDB_LITE const std::string& DBImpl::GetName() const { @@ -4144,63 +5098,95 @@ const DBOptions& DBImpl::GetDBOptions() const { return db_options_; } bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) { - bool is_int_property = false; - bool need_out_of_mutex = false; - DBPropertyType property_type = - GetPropertyType(property, &is_int_property, &need_out_of_mutex); - + const DBPropertyInfo* property_info = GetPropertyInfo(property); value->clear(); - if (is_int_property) { + auto cfd = reinterpret_cast(column_family)->cfd(); + if (property_info == nullptr) { + return false; + } else if (property_info->handle_int) { uint64_t int_value; - bool ret_value = GetIntPropertyInternal(column_family, property_type, - need_out_of_mutex, &int_value); + bool ret_value = + 
GetIntPropertyInternal(cfd, *property_info, false, &int_value); if (ret_value) { *value = ToString(int_value); } return ret_value; - } else { - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); + } else if (property_info->handle_string) { InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetStringProperty(property_type, property, + return cfd->internal_stats()->GetStringProperty(*property_info, property, value); } + // Shouldn't reach here since exactly one of handle_string and handle_int + // should be non-nullptr. + assert(false); + return false; } bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) { - bool is_int_property = false; - bool need_out_of_mutex = false; - DBPropertyType property_type = - GetPropertyType(property, &is_int_property, &need_out_of_mutex); - if (!is_int_property) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { return false; } - return GetIntPropertyInternal(column_family, property_type, need_out_of_mutex, - value); + auto cfd = reinterpret_cast(column_family)->cfd(); + return GetIntPropertyInternal(cfd, *property_info, false, value); } -bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, - DBPropertyType property_type, - bool need_out_of_mutex, uint64_t* value) { - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); - - if (!need_out_of_mutex) { - InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetIntProperty(property_type, value, this); +bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value) { + assert(property_info.handle_int != nullptr); + if (!property_info.need_out_of_mutex) { + if (is_locked) { + mutex_.AssertHeld(); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } else { + InstrumentedMutexLock 
l(&mutex_); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } } else { - SuperVersion* sv = GetAndRefSuperVersion(cfd); + SuperVersion* sv = nullptr; + if (!is_locked) { + sv = GetAndRefSuperVersion(cfd); + } else { + sv = cfd->GetSuperVersion(); + } bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( - property_type, sv->current, value); + property_info, sv->current, value); - ReturnAndCleanupSuperVersion(cfd, sv); + if (!is_locked) { + ReturnAndCleanupSuperVersion(cfd, sv); + } return ret; } } +bool DBImpl::GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { + return false; + } + + uint64_t sum = 0; + { + // Needs mutex to protect the list of column families. + InstrumentedMutexLock l(&mutex_); + uint64_t value; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (GetIntPropertyInternal(cfd, *property_info, true, &value)) { + sum += value; + } else { + return false; + } + } + } + *aggregated_value = sum; + return true; +} + SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { // TODO(ljin): consider using GetReferencedSuperVersion() directly return cfd->GetThreadLocalSuperVersion(&mutex_); @@ -4452,6 +5438,88 @@ Status DBImpl::DeleteFile(std::string name) { return status; } +Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + Status status; + auto cfh = reinterpret_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + VersionEdit edit; + std::vector deleted_files; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + Version* input_version = cfd->current(); + + auto* vstorage = input_version->storage_info(); + for (int i = 1; i < cfd->NumberLevels(); i++) { + if (vstorage->LevelFiles(i).empty() || + 
!vstorage->OverlapInLevel(i, begin, end)) { + continue; + } + std::vector level_files; + InternalKey begin_storage, end_storage, *begin_key, *end_key; + if (begin == nullptr) { + begin_key = nullptr; + } else { + begin_storage.SetMaxPossibleForUserKey(*begin); + begin_key = &begin_storage; + } + if (end == nullptr) { + end_key = nullptr; + } else { + end_storage.SetMinPossibleForUserKey(*end); + end_key = &end_storage; + } + + vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1, + nullptr, false); + FileMetaData* level_file; + for (uint32_t j = 0; j < level_files.size(); j++) { + level_file = level_files[j]; + if (((begin == nullptr) || + (cfd->internal_comparator().user_comparator()->Compare( + level_file->smallest.user_key(), *begin) >= 0)) && + ((end == nullptr) || + (cfd->internal_comparator().user_comparator()->Compare( + level_file->largest.user_key(), *end) <= 0))) { + if (level_file->being_compacted) { + continue; + } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(i, level_file->fd.GetNumber()); + deleted_files.push_back(level_file); + level_file->being_compacted = true; + } + } + } + if (edit.GetDeletedFiles().empty()) { + job_context.Clean(); + return Status::OK(); + } + input_version->Ref(); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWorkWrapper( + cfd, &job_context, *cfd->GetLatestMutableCFOptions()); + } + for (auto* deleted_file : deleted_files) { + deleted_file->being_compacted = false; + } + input_version->Unref(); + FindObsoleteFiles(&job_context, false); + } // lock released here + + LogFlush(db_options_.info_log); + // remove files outside the db-lock + if (job_context.HaveSomethingToDelete()) { + // Call PurgeObsoleteFiles() without holding mutex. 
+ PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + return status; +} + void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { InstrumentedMutexLock l(&mutex_); versions_->GetLiveFilesMetaData(metadata); @@ -4577,6 +5645,10 @@ Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) { return Status::NotSupported(""); } +Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { + delete column_family; + return Status::OK(); +} DB::~DB() { } @@ -4605,24 +5677,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, return s; } - for (auto& cfd : column_families) { - s = CheckCompressionSupported(cfd.options); - if (!s.ok()) { - return s; - } - if (db_options.db_paths.size() > 1) { - if ((cfd.options.compaction_style != kCompactionStyleUniversal) && - (cfd.options.compaction_style != kCompactionStyleLevel)) { - return Status::NotSupported( - "More than one DB paths are only supported in " - "universal and level compaction styles. "); - } - } - } - - if (db_options.db_paths.size() > 4) { - return Status::NotSupported( - "More than four DB paths are not supported yet. 
"); + s = ValidateOptions(db_options, column_families); + if (!s.ok()) { + return s; } *dbptr = nullptr; @@ -4664,16 +5721,18 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, EnvOptions soptions(db_options); EnvOptions opt_env_options = impl->db_options_.env->OptimizeForLogWrite(soptions, impl->db_options_); - s = impl->db_options_.env->NewWritableFile( - LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, - opt_env_options); + s = NewWritableFile(impl->db_options_.env, + LogFileName(impl->db_options_.wal_dir, new_log_number), + &lfile, opt_env_options); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); + lfile->SetPreallocationBlockSize((max_write_buffer_size / 10) + max_write_buffer_size); impl->logfile_number_ = new_log_number; unique_ptr file_writer( new WritableFileWriter(std::move(lfile), opt_env_options)); - impl->logs_.emplace_back(new_log_number, - new log::Writer(std::move(file_writer))); + impl->logs_.emplace_back( + new_log_number, + new log::Writer(std::move(file_writer), new_log_number, + impl->db_options_.recycle_log_file_num > 0)); // set column family handles for (auto cf : column_families) { @@ -4743,24 +5802,59 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } } TEST_SYNC_POINT("DBImpl::Open:Opened"); + Status persist_options_status; if (s.ok()) { + // Persist RocksDB Options before scheduling the compaction. + // The WriteOptionsFile() will release and lock the mutex internally. + persist_options_status = impl->WriteOptionsFile(); + + *dbptr = impl; impl->opened_successfully_ = true; impl->MaybeScheduleFlushOrCompaction(); } impl->mutex_.Unlock(); + auto sfm = static_cast( + impl->db_options_.sst_file_manager.get()); + if (s.ok() && sfm) { + // Notify SstFileManager about all sst files that already exist in + // db_paths[0] when the DB is opened. 
+ auto& db_path = impl->db_options_.db_paths[0]; + std::vector existing_files; + impl->db_options_.env->GetChildren(db_path.path, &existing_files); + for (auto& file_name : existing_files) { + uint64_t file_number; + FileType file_type; + std::string file_path = db_path.path + "/" + file_name; + if (ParseFileName(file_name, &file_number, &file_type) && + file_type == kTableFile) { + sfm->OnAddFile(file_path); + } + } + } + if (s.ok()) { Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p", impl); LogFlush(impl->db_options_.info_log); - - *dbptr = impl; - } else { + if (!persist_options_status.ok()) { + if (db_options.fail_if_options_file_error) { + s = Status::IOError( + "DB::Open() failed --- Unable to persist Options file", + persist_options_status.ToString()); + } + Warn(impl->db_options_.info_log, + "Unable to persist options in DB::Open() -- %s", + persist_options_status.ToString().c_str()); + } + } + if (!s.ok()) { for (auto* h : *handles) { delete h; } handles->clear(); delete impl; + *dbptr = nullptr; } return s; } @@ -4798,7 +5892,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); } else if (type == kTableFile) { - del = DeleteOrMoveToTrash(&options, path_to_delete); + del = DeleteSSTFile(&options, path_to_delete, 0); } else { del = env->DeleteFile(path_to_delete); } @@ -4814,13 +5908,9 @@ Status DestroyDB(const std::string& dbname, const Options& options) { for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { // Lock file will be deleted at end - Status del; std::string table_path = db_path.path + "/" + filenames[i]; - if (path_id == 0) { - del = DeleteOrMoveToTrash(&options, table_path); - } else { - del = env->DeleteFile(table_path); - } + Status del = DeleteSSTFile(&options, table_path, + static_cast(path_id)); if (result.ok() && !del.ok()) { result = del; } @@ -4857,6 
+5947,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { } } } + // ignore case where no archival directory is present. env->DeleteDir(archivedir); @@ -4868,12 +5959,116 @@ Status DestroyDB(const std::string& dbname, const Options& options) { return result; } +Status DBImpl::WriteOptionsFile() { +#ifndef ROCKSDB_LITE + mutex_.AssertHeld(); + + std::vector cf_names; + std::vector cf_opts; + + // This part requires mutex to protect the column family options + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + cf_names.push_back(cfd->GetName()); + cf_opts.push_back(BuildColumnFamilyOptions( + *cfd->options(), *cfd->GetLatestMutableCFOptions())); + } + + // Unlock during expensive operations. New writes cannot get here + // because the single write thread ensures all new writes get queued. + mutex_.Unlock(); + + std::string file_name = + TempOptionsFileName(GetName(), versions_->NewFileNumber()); + Status s = PersistRocksDBOptions(GetDBOptions(), cf_names, cf_opts, file_name, + GetEnv()); + + if (s.ok()) { + s = RenameTempFileToOptionsFile(file_name); + } + mutex_.Lock(); + return s; +#else + return Status::OK(); +#endif // !ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +namespace { +void DeleteOptionsFilesHelper(const std::map& filenames, + const size_t num_files_to_keep, + const std::shared_ptr& info_log, + Env* env) { + if (filenames.size() <= num_files_to_keep) { + return; + } + for (auto iter = std::next(filenames.begin(), num_files_to_keep); + iter != filenames.end(); ++iter) { + if (!env->DeleteFile(iter->second).ok()) { + Warn(info_log, "Unable to delete options file %s", iter->second.c_str()); + } + } +} +} // namespace +#endif // !ROCKSDB_LITE + +Status DBImpl::DeleteObsoleteOptionsFiles() { +#ifndef ROCKSDB_LITE + std::vector filenames; + // use ordered map to store keep the filenames sorted from the newest + // to the oldest. 
+ std::map options_filenames; + Status s; + s = GetEnv()->GetChildren(GetName(), &filenames); + if (!s.ok()) { + return s; + } + for (auto& filename : filenames) { + uint64_t file_number; + FileType type; + if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) { + options_filenames.insert( + {std::numeric_limits::max() - file_number, + GetName() + "/" + filename}); + } + } + + // Keeps the latest 2 Options file + const size_t kNumOptionsFilesKept = 2; + DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept, + db_options_.info_log, GetEnv()); + return Status::OK(); +#else + return Status::OK(); +#endif // !ROCKSDB_LITE +} + +Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { +#ifndef ROCKSDB_LITE + Status s; + + versions_->options_file_number_ = versions_->NewFileNumber(); + std::string options_file_name = + OptionsFileName(GetName(), versions_->options_file_number_); + // Retry if the file name happen to conflict with an existing one. 
+ s = GetEnv()->RenameFile(file_name, options_file_name); + + DeleteObsoleteOptionsFiles(); + return s; +#else + return Status::OK(); +#endif // !ROCKSDB_LITE +} + #if ROCKSDB_USING_THREAD_STATUS void DBImpl::NewThreadStatusCfInfo( ColumnFamilyData* cfd) const { if (db_options_.enable_thread_tracking) { - ThreadStatusUtil::NewColumnFamilyInfo(this, cfd); + ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(), + cfd->ioptions()->env); } } @@ -4933,20 +6128,20 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, #endif // ROCKSDB_LITE #ifndef ROCKSDB_LITE -Status DBImpl::GetLatestSequenceForKeyFromMemtable(SuperVersion* sv, - const Slice& key, - SequenceNumber* seq) { +Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, + bool cache_only, SequenceNumber* seq, + bool* found_record_for_key) { Status s; - std::string value; MergeContext merge_context; SequenceNumber current_seq = versions_->LastSequence(); LookupKey lkey(key, current_seq); *seq = kMaxSequenceNumber; + *found_record_for_key = false; // Check if there is a record for this key in the latest memtable - sv->mem->Get(lkey, &value, &s, &merge_context, seq); + sv->mem->Get(lkey, nullptr, &s, &merge_context, seq); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -4959,11 +6154,12 @@ Status DBImpl::GetLatestSequenceForKeyFromMemtable(SuperVersion* sv, if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check immutable memtables + *found_record_for_key = true; return Status::OK(); } // Check if there is a record for this key in the immutable memtables - sv->imm->Get(lkey, &value, &s, &merge_context, seq); + sv->imm->Get(lkey, nullptr, &s, &merge_context, seq); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. 
@@ -4976,11 +6172,12 @@ Status DBImpl::GetLatestSequenceForKeyFromMemtable(SuperVersion* sv, if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check memtable history + *found_record_for_key = true; return Status::OK(); } // Check if there is a record for this key in the immutable memtables - sv->imm->GetFromHistory(lkey, &value, &s, &merge_context, seq); + sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, seq); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -4991,6 +6188,31 @@ Status DBImpl::GetLatestSequenceForKeyFromMemtable(SuperVersion* sv, return s; } + if (*seq != kMaxSequenceNumber) { + // Found a sequence number, no need to check SST files + *found_record_for_key = true; + return Status::OK(); + } + + // TODO(agiardullo): possible optimization: consider checking cached + // SST files if cache_only=true? + if (!cache_only) { + // Check tables + ReadOptions read_options; + + sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, + nullptr /* value_found */, found_record_for_key, seq); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading SST files + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Unexpected status returned from Version::Get: %s\n", + s.ToString().c_str()); + + return s; + } + } + return Status::OK(); } #endif // ROCKSDB_LITE diff --git a/external/rocksdb/db/db_impl.h b/external/rocksdb/db/db_impl.h index d7cc9db95c..3d0eab55cb 100644 --- a/external/rocksdb/db/db_impl.h +++ b/external/rocksdb/db/db_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -10,8 +10,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -29,18 +31,18 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "db/writebuffer.h" #include "memtable_list.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/event_logger.h" #include "util/hash.h" #include "util/instrumented_mutex.h" -#include "util/scoped_arena_iterator.h" #include "util/stop_watch.h" #include "util/thread_local.h" @@ -55,6 +57,7 @@ class Arena; class WriteCallback; struct JobContext; struct ExternalSstFileInfo; +struct MemTableInfo; class DBImpl : public DB { public: @@ -122,6 +125,9 @@ class DBImpl : public DB { using DB::GetIntProperty; virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) override; + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) override; using DB::GetApproximateSizes; virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes, @@ -141,6 +147,9 @@ class DBImpl : public DB { virtual Status PauseBackgroundWork() override; virtual Status ContinueBackgroundWork() override; + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) override; + using DB::SetOptions; Status SetOptions( ColumnFamilyHandle* column_family, @@ -182,6 +191,8 @@ class DBImpl : public DB { const TransactionLogIterator::ReadOptions& read_options = TransactionLogIterator::ReadOptions()) override; virtual Status DeleteFile(std::string name) override; + Status DeleteFilesInRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); virtual void 
GetLiveFilesMetaData( std::vector* metadata) override; @@ -220,23 +231,47 @@ class DBImpl : public DB { bool include_history); // For a given key, check to see if there are any records for this key - // in the memtables, including memtable history. - - // On success, *seq will contain the sequence number for the - // latest such change or kMaxSequenceNumber if no records were present. - // Returns OK on success, other status on error reading memtables. - Status GetLatestSequenceForKeyFromMemtable(SuperVersion* sv, const Slice& key, - SequenceNumber* seq); + // in the memtables, including memtable history. If cache_only is false, + // SST files will also be checked. + // + // If a key is found, *found_record_for_key will be set to true and + // *seq will be set to the stored sequence number for the latest + // operation on this key or kMaxSequenceNumber if unknown. + // If no key is found, *found_record_for_key will be set to false. + // + // Note: If cache_only=false, it is possible for *seq to be set to 0 if + // the sequence number has been cleared from the record. If the caller is + // holding an active db snapshot, we know the missing sequence must be less + // than the snapshot's sequence number (sequence numbers are only cleared + // when there are no earlier active snapshots). + // + // If NotFound is returned and found_record_for_key is set to false, then no + // record for this key was found. If the caller is holding an active db + // snapshot, we know that no key could have existing after this snapshot + // (since we do not compact keys that have an earlier snapshot). + // + // Returns OK or NotFound on success, + // other status on unexpected error. 
+ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, + bool cache_only, SequenceNumber* seq, + bool* found_record_for_key); using DB::AddFile; virtual Status AddFile(ColumnFamilyHandle* column_family, - const ExternalSstFileInfo* file_info, + const std::vector& file_info_list, bool move_file) override; virtual Status AddFile(ColumnFamilyHandle* column_family, - const std::string& file_path, bool move_file) override; + const std::vector& file_path_list, + bool move_file) override; #endif // ROCKSDB_LITE + // Similar to GetSnapshot(), but also lets the db know that this snapshot + // will be used for transaction write-conflict checking. The DB can then + // make sure not to compact any keys that would prevent a write-conflict from + // being detected. + const Snapshot* GetSnapshotForWriteConflictBoundary(); + // checks if all live files exist on file system and that their file sizes // match to our in-memory records virtual Status CheckConsistency(); @@ -246,9 +281,16 @@ class DBImpl : public DB { Status RunManualCompaction(ColumnFamilyData* cfd, int input_level, int output_level, uint32_t output_path_id, const Slice* begin, const Slice* end, + bool exclusive, bool disallow_trivial_move = false); -#ifndef ROCKSDB_LITE + // Return an internal iterator over the current state of the database. + // The keys of this iterator are internal keys (see format.h). + // The returned iterator should be deleted when no longer needed. + InternalIterator* NewInternalIterator( + Arena* arena, ColumnFamilyHandle* column_family = nullptr); + +#ifndef NDEBUG // Extra methods (for testing) that are not in the public DB interface // Implemented in db_impl_debug.cc @@ -258,7 +300,8 @@ class DBImpl : public DB { bool disallow_trivial_move = false); // Force current memtable contents to be flushed. 
- Status TEST_FlushMemTable(bool wait = true); + Status TEST_FlushMemTable(bool wait = true, + ColumnFamilyHandle* cfh = nullptr); // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); @@ -266,12 +309,6 @@ class DBImpl : public DB { // Wait for any compaction Status TEST_WaitForCompact(); - // Return an internal iterator over the current state of the database. - // The keys of this iterator are internal keys (see format.h). - // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator( - Arena* arena, ColumnFamilyHandle* column_family = nullptr); - // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family = @@ -305,7 +342,32 @@ class DBImpl : public DB { uint64_t TEST_LogfileNumber(); -#endif // ROCKSDB_LITE + uint64_t TEST_total_log_size() const { return total_log_size_; } + + // Returns column family name to ImmutableCFOptions map. + Status TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map); + + // Return the lastest MutableCFOptions of of a column family + Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, + MutableCFOptions* mutable_cf_opitons); + + Cache* TEST_table_cache() { return table_cache_.get(); } + + WriteController& TEST_write_controler() { return write_controller_; } + + uint64_t TEST_FindMinLogContainingOutstandingPrep(); + uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + +#endif // NDEBUG + + // Return maximum background compaction allowed to be scheduled based on + // compaction status. + int BGCompactionsAllowed() const; + + // move logs pending closing from job_context to the DB queue and + // schedule a purge + void ScheduleBgLogWriterClose(JobContext* job_context); // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. 
@@ -319,7 +381,10 @@ class DBImpl : public DB { // belong to live files are posibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. - void PurgeObsoleteFiles(const JobContext& background_contet); + void PurgeObsoleteFiles(const JobContext& background_contet, + bool schedule_only = false); + + void SchedulePurge(); ColumnFamilyHandle* DefaultColumnFamily() const override; @@ -363,24 +428,116 @@ class DBImpl : public DB { // Same as above, should called without mutex held and not on write thread. ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id); + // Returns the number of currently running flushes. + // REQUIREMENT: mutex_ must be held when calling this function. + int num_running_flushes() { + mutex_.AssertHeld(); + return num_running_flushes_; + } + + // Returns the number of currently running compactions. + // REQUIREMENT: mutex_ must be held when calling this function. + int num_running_compactions() { + mutex_.AssertHeld(); + return num_running_compactions_; + } + + // hollow transactions shell used for recovery. + // these will then be passed to TransactionDB so that + // locks can be reacquired before writing can resume. 
+ struct RecoveredTransaction { + uint64_t log_number_; + std::string name_; + WriteBatch* batch_; + explicit RecoveredTransaction(const uint64_t log, const std::string& name, + WriteBatch* batch) + : log_number_(log), name_(name), batch_(batch) {} + + ~RecoveredTransaction() { delete batch_; } + }; + + bool allow_2pc() const { return db_options_.allow_2pc; } + + std::unordered_map + recovered_transactions() { + return recovered_transactions_; + } + + RecoveredTransaction* GetRecoveredTransaction(const std::string& name) { + auto it = recovered_transactions_.find(name); + if (it == recovered_transactions_.end()) { + return nullptr; + } else { + return it->second; + } + } + + void InsertRecoveredTransaction(const uint64_t log, const std::string& name, + WriteBatch* batch) { + recovered_transactions_[name] = new RecoveredTransaction(log, name, batch); + MarkLogAsContainingPrepSection(log); + } + + void DeleteRecoveredTransaction(const std::string& name) { + auto it = recovered_transactions_.find(name); + assert(it != recovered_transactions_.end()); + auto* trx = it->second; + recovered_transactions_.erase(it); + MarkLogAsHavingPrepSectionFlushed(trx->log_number_); + delete trx; + } + + void DeleteAllRecoveredTransactions() { + for (auto it = recovered_transactions_.begin(); + it != recovered_transactions_.end(); it++) { + delete it->second; + } + recovered_transactions_.clear(); + } + + void MarkLogAsHavingPrepSectionFlushed(uint64_t log); + void MarkLogAsContainingPrepSection(uint64_t log); + void AddToLogsToFreeQueue(log::Writer* log_writer) { + logs_to_free_queue_.push_back(log_writer); + } + + Status NewDB(); + protected: Env* const env_; const std::string dbname_; unique_ptr versions_; const DBOptions db_options_; Statistics* stats_; + std::unordered_map + recovered_transactions_; + + InternalIterator* NewInternalIterator(const ReadOptions&, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena); - Iterator* NewInternalIterator(const 
ReadOptions&, ColumnFamilyData* cfd, - SuperVersion* super_version, Arena* arena); + // Except in DB::Open(), WriteOptionsFile can only be called when: + // 1. WriteThread::Writer::EnterUnbatched() is used. + // 2. db_mutex is held + Status WriteOptionsFile(); + + // The following two functions can only be called when: + // 1. WriteThread::Writer::EnterUnbatched() is used. + // 2. db_mutex is NOT held + Status RenameTempFileToOptionsFile(const std::string& file_name); + Status DeleteObsoleteOptionsFiles(); void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id); + int job_id, TableProperties prop); void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction *c, const Status &st, const CompactionJobStats& job_stats, int job_id); + void NotifyOnMemTableSealed(ColumnFamilyData* cfd, + const MemTableInfo& mem_table_info); void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; @@ -389,11 +546,17 @@ class DBImpl : public DB { void EraseThreadStatusDbInfo() const; Status WriteImpl(const WriteOptions& options, WriteBatch* updates, - WriteCallback* callback); + WriteCallback* callback = nullptr, + uint64_t* log_used = nullptr, uint64_t log_ref = 0, + bool disable_memtable = false); + + uint64_t FindMinLogContainingOutstandingPrep(); + uint64_t FindMinPrepLogReferencedByMemTable(); private: friend class DB; friend class InternalStats; + friend class TransactionImpl; #ifndef ROCKSDB_LITE friend class ForwardIterator; #endif @@ -406,13 +569,14 @@ class DBImpl : public DB { struct WriteContext; - Status NewDB(); + struct PurgeFileInfo; // Recover the descriptor from persistent storage. May do a significant // amount of work to recover recently logged updates. Any changes to // be made to the descriptor are added to *edit. 
Status Recover(const std::vector& column_families, - bool read_only = false, bool error_if_log_file_exist = false); + bool read_only = false, bool error_if_log_file_exist = false, + bool error_if_data_exists_in_logs = false); void MaybeIgnoreError(Status* s) const; @@ -420,13 +584,18 @@ class DBImpl : public DB { // Delete any unneeded files and stale in-memory entries. void DeleteObsoleteFiles(); + // Delete obsolete files and log status and information of file deletion + void DeleteObsoleteFileImpl(Status file_deletion_status, int job_id, + const std::string& fname, FileType type, + uint64_t number, uint32_t path_id); // Background process needs to call // auto x = CaptureCurrentFileNumberInPendingOutputs() + // auto file_num = versions_->NewFileNumber(); // // ReleaseFileNumberFromPendingOutputs(x) - // This will protect any temporary files created while is - // executing from being deleted. + // This will protect any file with number `file_num` or greater from being + // deleted while is running. // ----------- // This function will capture current file number and append it to // pending_outputs_. This will prevent any background process to delete any @@ -439,6 +608,8 @@ class DBImpl : public DB { // and blocked by any other pending_outputs_ calls) void ReleaseFileNumberFromPendingOutputs(std::list::iterator v); + Status SyncClosedLogs(JobContext* job_context); + // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, @@ -451,7 +622,7 @@ class DBImpl : public DB { SequenceNumber* max_sequence, bool read_only); // The following two methods are used to flush a memtable to - // storage. The first one is used atdatabase RecoveryTime (when the + // storage. The first one is used at database RecoveryTime (when the // database is opened) and is heavyweight because it holds the mutex // for the entire period. 
The second method WriteLevel0Table supports // concurrent flush memtables to storage. @@ -472,15 +643,21 @@ class DBImpl : public DB { // Wait for memtable flushed Status WaitForFlushMemTable(ColumnFamilyData* cfd); - void RecordFlushIOStats(); - void RecordCompactionIOStats(); - #ifndef ROCKSDB_LITE + // Finds the lowest level in the DB that the ingested file can be added to + // REQUIRES: mutex_ held + int PickLevelForIngestedFile(ColumnFamilyData* cfd, + const ExternalSstFileInfo& file_info); + Status CompactFilesImpl( const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, const int output_level, int output_path_id, JobContext* job_context, LogBuffer* log_buffer); + Status ReadExternalSstFileInfo(ColumnFamilyHandle* column_family, + const std::string& file_path, + ExternalSstFileInfo* file_info); + #endif // ROCKSDB_LITE ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); @@ -488,12 +665,17 @@ class DBImpl : public DB { void MaybeScheduleFlushOrCompaction(); void SchedulePendingFlush(ColumnFamilyData* cfd); void SchedulePendingCompaction(ColumnFamilyData* cfd); - static void BGWorkCompaction(void* db); + void SchedulePendingPurge(std::string fname, FileType type, uint64_t number, + uint32_t path_id, int job_id); + static void BGWorkCompaction(void* arg); static void BGWorkFlush(void* db); - void BackgroundCallCompaction(); + static void BGWorkPurge(void* arg); + static void UnscheduleCallback(void* arg); + void BackgroundCallCompaction(void* arg); void BackgroundCallFlush(); + void BackgroundCallPurge(); Status BackgroundCompaction(bool* madeProgress, JobContext* job_context, - LogBuffer* log_buffer); + LogBuffer* log_buffer, void* m = 0); Status BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); @@ -521,25 +703,34 @@ class DBImpl : public DB { // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, 
bool synced_dir, const Status& status); + const Snapshot* GetSnapshotImpl(bool is_write_conflict_boundary); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; + // The mutex for options file related operations. + // NOTE: should never acquire options_file_mutex_ and mutex_ at the + // same time. + InstrumentedMutex options_files_mutex_; // State below is protected by mutex_ InstrumentedMutex mutex_; + std::atomic shutting_down_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 - // * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't + // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't // made any progress // * whenever a compaction made any progress - // * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is - // done, even if it didn't make any progress) - // * whenever there is an error in background flush or compaction + // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases + // (i.e. whenever a flush is done, even if it didn't make any progress) + // * whenever there is an error in background purge, flush or compaction InstrumentedCondVar bg_cv_; uint64_t logfile_number_; + std::deque + log_recycle_files; // a list of log files that we can recycle bool log_dir_synced_; bool log_empty_; ColumnFamilyHandleImpl* default_cf_handle_; @@ -628,7 +819,7 @@ class DBImpl : public DB { Directories directories_; - WriteBuffer write_buffer_; + WriteBufferManager* write_buffer_manager_; WriteThread write_thread_; @@ -655,6 +846,19 @@ class DBImpl : public DB { // State is protected with db mutex. 
std::list pending_outputs_; + // PurgeFileInfo is a structure to hold information of files to be deleted in + // purge_queue_ + struct PurgeFileInfo { + std::string fname; + FileType type; + uint64_t number; + uint32_t path_id; + int job_id; + PurgeFileInfo(std::string fn, FileType t, uint64_t num, uint32_t pid, + int jid) + : fname(fn), type(t), number(num), path_id(pid), job_id(jid) {} + }; + // flush_queue_ and compaction_queue_ hold column families that we need to // flush and compact, respectively. // A column family is inserted into flush_queue_ when it satisfies condition @@ -679,35 +883,55 @@ class DBImpl : public DB { // invariant(column family present in compaction_queue_ <==> // ColumnFamilyData::pending_compaction_ == true) std::deque compaction_queue_; + + // A queue to store filenames of the files to be purged + std::deque purge_queue_; + + // A queue to store log writers to close + std::deque logs_to_free_queue_; int unscheduled_flushes_; int unscheduled_compactions_; // count how many background compactions are running or have been scheduled int bg_compaction_scheduled_; - // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual - // compactions (if manual_compaction_ is not null). This mechanism enables - // manual compactions to wait until all other compactions are finished. - int bg_manual_only_; + // stores the number of compactions are currently running + int num_running_compactions_; // number of background memtable flush jobs, submitted to the HIGH pool int bg_flush_scheduled_; + // stores the number of flushes are currently running + int num_running_flushes_; + + // number of background obsolete file purge jobs, submitted to the HIGH pool + int bg_purge_scheduled_; + // Information for a manual compaction struct ManualCompaction { ColumnFamilyData* cfd; int input_level; int output_level; uint32_t output_path_id; - bool done; Status status; + bool done; bool in_progress; // compaction request being processed? 
+ bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run const InternalKey* begin; // nullptr means beginning of key range const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting InternalKey tmp_storage; // Used to keep track of compaction progress - bool disallow_trivial_move; // Force actual compaction to run + InternalKey tmp_storage1; // Used to keep track of compaction progress + Compaction* compaction; + }; + std::deque manual_compaction_dequeue_; + + struct CompactionArg { + DBImpl* db; + ManualCompaction* m; }; - ManualCompaction* manual_compaction_; // Have we encountered a background error in paranoid mode? Status bg_error_; @@ -730,7 +954,10 @@ class DBImpl : public DB { // they're unique std::atomic next_job_id_; - bool flush_on_destroy_; // Used when disableWAL is true. + // A flag indicating whether the current rocksdb database has any + // data that is not yet persisted into either WAL or SST file. + // Used when disableWAL is true. 
+ bool has_unpersisted_data_; static const int KEEP_LOG_FILE_NUM = 1000; // MSVC version 1800 still does not have constexpr for ::max() @@ -741,6 +968,10 @@ class DBImpl : public DB { // The options to access storage files const EnvOptions env_options_; + // A set of compactions that are running right now + // REQUIRES: mutex held + std::unordered_set running_compactions_; + #ifndef ROCKSDB_LITE WalManager wal_manager_; #endif // ROCKSDB_LITE @@ -748,15 +979,40 @@ class DBImpl : public DB { // Unified interface for logging events EventLogger event_logger_; - // A value of >0 temporarily disables scheduling of background work + // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; + // A value of > 0 temporarily disables scheduling of background compaction + int bg_compaction_paused_; + // Guard against multiple concurrent refitting bool refitting_level_; // Indicate DB was opened successfully bool opened_successfully_; + // minmum log number still containing prepared data. + // this is used by FindObsoleteFiles to determine which + // flushed logs we must keep around because they still + // contain prepared data which has not been flushed or rolled back + std::priority_queue, std::greater> + min_log_with_prep_; + + // to be used in conjunction with min_log_with_prep_. + // once a transaction with data in log L is committed or rolled back + // rather than removing the value from the heap we add that value + // to prepared_section_completed_ which maps LOG -> instance_count + // since a log could contain multiple prepared sections + // + // when trying to determine the minmum log still active we first + // consult min_log_with_prep_. while that root value maps to + // a value > 0 in prepared_section_completed_ we decrement the + // instance_count for that log and pop the root value in + // min_log_with_prep_. This will work the same as a min_heap + // where we are deleteing arbitrary elements and the up heaping. 
+ std::unordered_map prepared_section_completed_; + std::mutex prep_heap_mutex_; + // No copying allowed DBImpl(const DBImpl&); void operator=(const DBImpl&); @@ -788,6 +1044,10 @@ class DBImpl : public DB { virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override; + #endif // ROCKSDB_LITE // Function that Get and KeyMayExist call with no_io true or false @@ -796,9 +1056,17 @@ class DBImpl : public DB { const Slice& key, std::string* value, bool* value_found = nullptr); - bool GetIntPropertyInternal(ColumnFamilyHandle* column_family, - DBPropertyType property_type, - bool need_out_of_mutex, uint64_t* value); + bool GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); + + bool HasPendingManualCompaction(); + bool HasExclusiveManualCompaction(); + void AddManualCompaction(ManualCompaction* m); + void RemoveManualCompaction(ManualCompaction* m); + bool ShouldntRunManualCompaction(ManualCompaction* m); + bool HaveManualCompaction(ColumnFamilyData* cfd); + bool MCOverlap(ManualCompaction* m, ManualCompaction* m1); }; // Sanitize db options. The caller should delete result.info_log if diff --git a/external/rocksdb/db/db_impl_add_file.cc b/external/rocksdb/db/db_impl_add_file.cc new file mode 100644 index 0000000000..3019f33913 --- /dev/null +++ b/external/rocksdb/db/db_impl_add_file.cc @@ -0,0 +1,419 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/db_impl.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include + +#include "db/builder.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/sst_file_writer.h" +#include "table/table_builder.h" +#include "util/file_reader_writer.h" +#include "util/file_util.h" +#include "util/sync_point.h" + +namespace rocksdb { + +namespace { +// RAII Timer +class StatsTimer { + public: + explicit StatsTimer(Env* env, uint64_t* counter) + : env_(env), counter_(counter), start_micros_(env_->NowMicros()) {} + ~StatsTimer() { *counter_ += env_->NowMicros() - start_micros_; } + + private: + Env* env_; + uint64_t* counter_; + uint64_t start_micros_; +}; +} // anonymous namespace + +#ifndef ROCKSDB_LITE + +Status DBImpl::ReadExternalSstFileInfo(ColumnFamilyHandle* column_family, + const std::string& file_path, + ExternalSstFileInfo* file_info) { + Status status; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + file_info->file_path = file_path; + status = env_->GetFileSize(file_path, &file_info->file_size); + if (!status.ok()) { + return status; + } + + // Access the file using TableReader to extract + // version, number of entries, smallest user key, largest user key + std::unique_ptr sst_file; + status = env_->NewRandomAccessFile(file_path, &sst_file, env_options_); + if (!status.ok()) { + return status; + } + std::unique_ptr sst_file_reader; + sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file))); + + std::unique_ptr table_reader; + status = cfd->ioptions()->table_factory->NewTableReader( + TableReaderOptions(*cfd->ioptions(), env_options_, + cfd->internal_comparator()), + std::move(sst_file_reader), file_info->file_size, &table_reader); + if (!status.ok()) { + return status; + } + + // Get the external sst file version from table properties + const UserCollectedProperties& user_collected_properties = + table_reader->GetTableProperties()->user_collected_properties; + 
UserCollectedProperties::const_iterator external_sst_file_version_iter = + user_collected_properties.find(ExternalSstFilePropertyNames::kVersion); + if (external_sst_file_version_iter == user_collected_properties.end()) { + return Status::InvalidArgument("Generated table version not found"); + } + + file_info->version = + DecodeFixed32(external_sst_file_version_iter->second.c_str()); + if (file_info->version == 1) { + // version 1 imply that all sequence numbers in table equal 0 + file_info->sequence_number = 0; + } else { + return Status::InvalidArgument("Generated table version is not supported"); + } + // Get number of entries in table + file_info->num_entries = table_reader->GetTableProperties()->num_entries; + + ParsedInternalKey key; + std::unique_ptr iter( + table_reader->NewIterator(ReadOptions())); + + // Get first (smallest) key from file + iter->SeekToFirst(); + if (!ParseInternalKey(iter->key(), &key)) { + return Status::Corruption("Generated table have corrupted keys"); + } + if (key.sequence != 0) { + return Status::Corruption("Generated table have non zero sequence number"); + } + file_info->smallest_key = key.user_key.ToString(); + + // Get last (largest) key from file + iter->SeekToLast(); + if (!ParseInternalKey(iter->key(), &key)) { + return Status::Corruption("Generated table have corrupted keys"); + } + if (key.sequence != 0) { + return Status::Corruption("Generated table have non zero sequence number"); + } + file_info->largest_key = key.user_key.ToString(); + + return Status::OK(); +} + +Status DBImpl::AddFile(ColumnFamilyHandle* column_family, + const std::vector& file_path_list, + bool move_file) { + Status status; + auto num_files = file_path_list.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } + + std::vector file_infos(num_files); + std::vector file_info_list(num_files); + for (size_t i = 0; i < num_files; i++) { + status = ReadExternalSstFileInfo(column_family, file_path_list[i], + 
&file_info_list[i]); + if (!status.ok()) { + return status; + } + } + return AddFile(column_family, file_info_list, move_file); +} + +Status DBImpl::AddFile(ColumnFamilyHandle* column_family, + const std::vector& file_info_list, + bool move_file) { + Status status; + auto cfh = reinterpret_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + + auto num_files = file_info_list.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } + + // Verify that passed files dont have overlapping ranges + if (num_files > 1) { + std::vector sorted_file_info_list(num_files); + for (size_t i = 0; i < num_files; i++) { + sorted_file_info_list[i] = &file_info_list[i]; + } + + auto* vstorage = cfd->current()->storage_info(); + std::sort( + sorted_file_info_list.begin(), sorted_file_info_list.end(), + [&vstorage, &file_info_list](const ExternalSstFileInfo* info1, + const ExternalSstFileInfo* info2) { + return vstorage->InternalComparator()->user_comparator()->Compare( + info1->smallest_key, info2->smallest_key) < 0; + }); + + for (size_t i = 0; i < num_files - 1; i++) { + if (sorted_file_info_list[i]->largest_key >= + sorted_file_info_list[i + 1]->smallest_key) { + return Status::NotSupported("Cannot add overlapping range among files"); + } + } + } + + std::vector micro_list(num_files, 0); + std::vector meta_list(num_files); + for (size_t i = 0; i < num_files; i++) { + StatsTimer t(env_, µ_list[i]); + if (file_info_list[i].num_entries == 0) { + return Status::InvalidArgument("File contain no entries"); + } + if (file_info_list[i].version != 1) { + return Status::InvalidArgument( + "Generated table version is not supported"); + } + // version 1 imply that file have only Put Operations with Sequence Number = + // 0 + + meta_list[i].smallest = + InternalKey(file_info_list[i].smallest_key, + file_info_list[i].sequence_number, ValueType::kTypeValue); + meta_list[i].largest = + InternalKey(file_info_list[i].largest_key, + 
file_info_list[i].sequence_number, ValueType::kTypeValue); + if (!meta_list[i].smallest.Valid() || !meta_list[i].largest.Valid()) { + return Status::Corruption("Generated table have corrupted keys"); + } + meta_list[i].smallest_seqno = file_info_list[i].sequence_number; + meta_list[i].largest_seqno = file_info_list[i].sequence_number; + if (meta_list[i].smallest_seqno != 0 || meta_list[i].largest_seqno != 0) { + return Status::InvalidArgument( + "Non zero sequence numbers are not supported"); + } + } + + std::vector::iterator> pending_outputs_inserted_elem_list( + num_files); + // Generate locations for the new tables + { + InstrumentedMutexLock l(&mutex_); + for (size_t i = 0; i < num_files; i++) { + StatsTimer t(env_, µ_list[i]); + pending_outputs_inserted_elem_list[i] = + CaptureCurrentFileNumberInPendingOutputs(); + meta_list[i].fd = FileDescriptor(versions_->NewFileNumber(), 0, + file_info_list[i].file_size); + } + } + + // Copy/Move external files into DB + std::vector db_fname_list(num_files); + size_t j = 0; + for (; j < num_files; j++) { + StatsTimer t(env_, µ_list[j]); + db_fname_list[j] = + TableFileName(db_options_.db_paths, meta_list[j].fd.GetNumber(), + meta_list[j].fd.GetPathId()); + if (move_file) { + status = env_->LinkFile(file_info_list[j].file_path, db_fname_list[j]); + if (status.IsNotSupported()) { + // Original file is on a different FS, use copy instead of hard linking + status = + CopyFile(env_, file_info_list[j].file_path, db_fname_list[j], 0); + } + } else { + status = CopyFile(env_, file_info_list[j].file_path, db_fname_list[j], 0); + } + TEST_SYNC_POINT("DBImpl::AddFile:FileCopied"); + if (!status.ok()) { + for (size_t i = 0; i < j; i++) { + Status s = env_->DeleteFile(db_fname_list[i]); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + db_fname_list[i].c_str(), s.ToString().c_str()); + } + } + return status; + } + } + + { + InstrumentedMutexLock l(&mutex_); + const 
MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + + if (!snapshots_.empty()) { + // Check that no snapshots are being held + status = + Status::NotSupported("Cannot add a file while holding snapshots"); + } + + if (status.ok()) { + // Verify that added file key range dont overlap with any keys in DB + SuperVersion* sv = cfd->GetSuperVersion()->Ref(); + Arena arena; + ReadOptions ro; + ro.total_order_seek = true; + ScopedArenaIterator iter(NewInternalIterator(ro, cfd, sv, &arena)); + + for (size_t i = 0; i < num_files; i++) { + StatsTimer t(env_, µ_list[i]); + InternalKey range_start(file_info_list[i].smallest_key, + kMaxSequenceNumber, kTypeValue); + iter->Seek(range_start.Encode()); + status = iter->status(); + + if (status.ok() && iter->Valid()) { + ParsedInternalKey seek_result; + if (ParseInternalKey(iter->key(), &seek_result)) { + auto* vstorage = cfd->current()->storage_info(); + if (vstorage->InternalComparator()->user_comparator()->Compare( + seek_result.user_key, file_info_list[i].largest_key) <= 0) { + status = Status::NotSupported("Cannot add overlapping range"); + break; + } + } else { + status = Status::Corruption("DB have corrupted keys"); + break; + } + } + } + } + + // The level the file will be ingested into + std::vector target_level_list(num_files, 0); + if (status.ok()) { + // Add files to L0 + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + for (size_t i = 0; i < num_files; i++) { + StatsTimer t(env_, µ_list[i]); + // Add file to the lowest possible level + target_level_list[i] = PickLevelForIngestedFile(cfd, file_info_list[i]); + edit.AddFile(target_level_list[i], meta_list[i].fd.GetNumber(), + meta_list[i].fd.GetPathId(), meta_list[i].fd.GetFileSize(), + meta_list[i].smallest, meta_list[i].largest, + meta_list[i].smallest_seqno, meta_list[i].largest_seqno, + meta_list[i].marked_for_compaction); + } + status = 
versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, + directories_.GetDbDir()); + } + write_thread_.ExitUnbatched(&w); + + if (status.ok()) { + delete InstallSuperVersionAndScheduleWork(cfd, nullptr, + mutable_cf_options); + } + for (size_t i = 0; i < num_files; i++) { + // Update internal stats + InternalStats::CompactionStats stats(1); + stats.micros = micro_list[i]; + stats.bytes_written = meta_list[i].fd.GetFileSize(); + stats.num_output_files = 1; + cfd->internal_stats()->AddCompactionStats(target_level_list[i], stats); + cfd->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE, + meta_list[i].fd.GetFileSize()); + ReleaseFileNumberFromPendingOutputs( + pending_outputs_inserted_elem_list[i]); + } + } + + if (!status.ok()) { + // We failed to add the files to the database + for (size_t i = 0; i < num_files; i++) { + Status s = env_->DeleteFile(db_fname_list[i]); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + db_fname_list[i].c_str(), s.ToString().c_str()); + } + } + } else if (status.ok() && move_file) { + // The files were moved and added successfully, remove original file links + for (size_t i = 0; i < num_files; i++) { + Status s = env_->DeleteFile(file_info_list[i].file_path); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file " + "link : %s", + file_info_list[i].file_path.c_str(), s.ToString().c_str()); + } + } + } + return status; +} + +// Finds the lowest level in the DB that the ingested file can be added to +int DBImpl::PickLevelForIngestedFile(ColumnFamilyData* cfd, + const ExternalSstFileInfo& file_info) { + mutex_.AssertHeld(); + + int target_level = 0; + auto* vstorage = cfd->current()->storage_info(); + auto* ucmp = vstorage->InternalComparator()->user_comparator(); + + Slice file_smallest_user_key(file_info.smallest_key); + Slice 
file_largest_user_key(file_info.largest_key); + + for (int lvl = cfd->NumberLevels() - 1; lvl >= vstorage->base_level(); + lvl--) { + if (vstorage->OverlapInLevel(lvl, &file_smallest_user_key, + &file_largest_user_key) == false) { + // Make sure that the file dont overlap with the output of any + // compaction running right now + Slice compaction_smallest_user_key; + Slice compaction_largest_user_key; + bool overlap_with_compaction_output = false; + for (Compaction* c : running_compactions_) { + if (c->column_family_data()->GetID() != cfd->GetID() || + c->output_level() != lvl) { + continue; + } + + compaction_smallest_user_key = c->GetSmallestUserKey(); + compaction_largest_user_key = c->GetLargestUserKey(); + + if (ucmp->Compare(file_smallest_user_key, + compaction_largest_user_key) <= 0 && + ucmp->Compare(file_largest_user_key, + compaction_smallest_user_key) >= 0) { + overlap_with_compaction_output = true; + break; + } + } + + if (overlap_with_compaction_output == false) { + // Level lvl is the lowest level that dont have any files with key + // range overlapping with our file key range and no compactions + // planning to add overlapping files in it. + target_level = lvl; + break; + } + } + } + + return target_level; +} +#endif // ROCKSDB_LITE + +} // namespace rocksdb diff --git a/external/rocksdb/db/db_impl_debug.cc b/external/rocksdb/db/db_impl_debug.cc index dc40fefc6e..37a58a307f 100644 --- a/external/rocksdb/db/db_impl_debug.cc +++ b/external/rocksdb/db/db_impl_debug.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE +#ifndef NDEBUG #include "db/db_impl.h" #include "util/thread_status_updater.h" @@ -19,23 +19,6 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } -Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, - ColumnFamilyHandle* column_family) { - ColumnFamilyData* cfd; - if (column_family == nullptr) { - cfd = default_cf_handle_->cfd(); - } else { - auto cfh = reinterpret_cast(column_family); - cfd = cfh->cfd(); - } - - mutex_.Lock(); - SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); - mutex_.Unlock(); - ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version, arena); -} - int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; @@ -87,14 +70,21 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin, cfd->ioptions()->compaction_style == kCompactionStyleFIFO) ? 
level : level + 1; - return RunManualCompaction(cfd, level, output_level, 0, begin, end, + return RunManualCompaction(cfd, level, output_level, 0, begin, end, true, disallow_trivial_move); } -Status DBImpl::TEST_FlushMemTable(bool wait) { +Status DBImpl::TEST_FlushMemTable(bool wait, ColumnFamilyHandle* cfh) { FlushOptions fo; fo.wait = wait; - return FlushMemTable(default_cf_handle_->cfd(), fo); + ColumnFamilyData* cfd; + if (cfh == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfhi = reinterpret_cast(cfh); + cfd = cfhi->cfd(); + } + return FlushMemTable(cfd, fo); } Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { @@ -152,5 +142,41 @@ uint64_t DBImpl::TEST_LogfileNumber() { return logfile_number_; } +Status DBImpl::TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map) { + std::vector cf_names; + std::vector iopts; + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + cf_names.push_back(cfd->GetName()); + iopts.push_back(cfd->ioptions()); + } + } + iopts_map->clear(); + for (size_t i = 0; i < cf_names.size(); ++i) { + iopts_map->insert({cf_names[i], iopts[i]}); + } + + return Status::OK(); +} + +uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() { + return FindMinLogContainingOutstandingPrep(); +} + +uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() { + return FindMinPrepLogReferencedByMemTable(); +} + +Status DBImpl::TEST_GetLatestMutableCFOptions( + ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) { + InstrumentedMutexLock l(&mutex_); + + auto cfh = reinterpret_cast(column_family); + *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions(); + return Status::OK(); +} + } // namespace rocksdb -#endif // ROCKSDB_LITE +#endif // NDEBUG diff --git a/external/rocksdb/db/db_impl_experimental.cc b/external/rocksdb/db/db_impl_experimental.cc index 6bf0ba6a14..90e034cd0c 100644 --- a/external/rocksdb/db/db_impl_experimental.cc 
+++ b/external/rocksdb/db/db_impl_experimental.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -49,8 +49,7 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family, } // Since we have some more files to compact, we should also recompute // compaction score - vstorage->ComputeCompactionScore(*cfd->GetLatestMutableCFOptions(), - CompactionOptionsFIFO()); + vstorage->ComputeCompactionScore(*cfd->GetLatestMutableCFOptions()); SchedulePendingCompaction(cfd); MaybeScheduleFlushOrCompaction(); } diff --git a/external/rocksdb/db/db_impl_readonly.cc b/external/rocksdb/db/db_impl_readonly.cc index 618ade8c97..57c14df149 100644 --- a/external/rocksdb/db/db_impl_readonly.cc +++ b/external/rocksdb/db/db_impl_readonly.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -54,10 +54,11 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, auto db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), (read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ + ? 
reinterpret_cast(read_options.snapshot) + ->number_ : latest_snapshot), - super_version->mutable_cf_options.max_sequential_skip_in_iterations); + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number); auto internal_iter = NewInternalIterator( read_options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); @@ -81,10 +82,11 @@ Status DBImplReadOnly::NewIterators( auto* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), (read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ - : latest_snapshot), - sv->mutable_cf_options.max_sequential_skip_in_iterations); + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot), + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number); auto* internal_iter = NewInternalIterator( read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); diff --git a/external/rocksdb/db/db_impl_readonly.h b/external/rocksdb/db/db_impl_readonly.h index 8f3103aaca..a410a4e32b 100644 --- a/external/rocksdb/db/db_impl_readonly.h +++ b/external/rocksdb/db/db_impl_readonly.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/util/db_info_dumper.cc b/external/rocksdb/db/db_info_dumper.cc similarity index 97% rename from external/rocksdb/util/db_info_dumper.cc rename to external/rocksdb/db/db_info_dumper.cc index 6cb978fbb1..56cf3e288f 100644 --- a/external/rocksdb/util/db_info_dumper.cc +++ b/external/rocksdb/db/db_info_dumper.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,6 +7,8 @@ #define __STDC_FORMAT_MACROS #endif +#include "db/db_info_dumper.h" + #include #include #include @@ -16,7 +18,6 @@ #include "db/filename.h" #include "rocksdb/options.h" #include "rocksdb/env.h" -#include "util/db_info_dumper.h" namespace rocksdb { diff --git a/external/rocksdb/util/db_info_dumper.h b/external/rocksdb/db/db_info_dumper.h similarity index 85% rename from external/rocksdb/util/db_info_dumper.h rename to external/rocksdb/db/db_info_dumper.h index ed0a63ded8..470b6224f2 100644 --- a/external/rocksdb/util/db_info_dumper.h +++ b/external/rocksdb/db/db_info_dumper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/db_inplace_update_test.cc b/external/rocksdb/db/db_inplace_update_test.cc index a04c2f50c6..2acc25700d 100644 --- a/external/rocksdb/db/db_inplace_update_test.cc +++ b/external/rocksdb/db/db_inplace_update_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,8 +6,8 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/db_test_util.h" namespace rocksdb { @@ -18,12 +18,11 @@ class DBTestInPlaceUpdate : public DBTestBase { TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) { do { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.inplace_update_support = true; options.env = env_; options.write_buffer_size = 100000; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller size @@ -41,12 +40,11 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) { TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) { do { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.inplace_update_support = true; options.env = env_; options.write_buffer_size = 100000; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of larger size @@ -64,7 +62,7 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) { TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) { do { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.inplace_update_support = true; @@ -72,7 +70,6 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTestInPlaceUpdate::updateInPlaceSmallerSize; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller size @@ -92,7 +89,7 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) { TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) { do { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; 
options.inplace_update_support = true; @@ -100,7 +97,6 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller varint size @@ -120,7 +116,7 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) { TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) { do { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.inplace_update_support = true; @@ -128,7 +124,6 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTestInPlaceUpdate::updateInPlaceLargerSize; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of larger size @@ -146,7 +141,7 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) { TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) { do { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.inplace_update_support = true; @@ -154,7 +149,6 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTestInPlaceUpdate::updateInPlaceNoAction; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Callback function requests no actions from db diff --git a/external/rocksdb/db/db_io_failure_test.cc b/external/rocksdb/db/db_io_failure_test.cc new file mode 100644 index 0000000000..4d66e1d1a3 --- /dev/null +++ b/external/rocksdb/db/db_io_failure_test.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace rocksdb { + +class DBIOFailureTest : public DBTestBase { + public: + DBIOFailureTest() : DBTestBase("/db_io_failure_test") {} +}; + +#ifndef ROCKSDB_LITE +// Check that number of files does not grow when writes are dropped +TEST_F(DBIOFailureTest, DropWrites) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.paranoid_checks = false; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + const size_t num_files = CountFiles(); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); + env_->sleep_counter_.Reset(); + env_->no_sleep_ = true; + for (int i = 0; i < 5; i++) { + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + for (int level = 0; level < dbfull()->NumberLevels(); level++) { + if (level > 0 && level == dbfull()->NumberLevels() - 1) { + break; + } + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + } + } else { + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + } + + std::string property_value; + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("5", property_value); + + env_->drop_writes_.store(false, std::memory_order_release); + ASSERT_LT(CountFiles(), num_files + 3); + + // Check that compaction attempts slept after errors + // TODO @krad: Figure out why 
ASSERT_EQ 5 keeps failing in certain compiler + // versions + ASSERT_GE(env_->sleep_counter_.Read(), 4); + } while (ChangeCompactOptions()); +} + +// Check background error counter bumped on flush failures. +TEST_F(DBIOFailureTest, DropWritesFlush) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.max_background_flushes = 1; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); + + std::string property_value; + // Background error count is 0 now. + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("0", property_value); + + dbfull()->TEST_FlushMemTable(true); + + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("1", property_value); + + env_->drop_writes_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); +} +#endif // ROCKSDB_LITE + +// Check that CompactRange() returns failure if there is not enough space left +// on device +TEST_F(DBIOFailureTest, NoSpaceCompactRange) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + // generate 5 tables + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i), Key(i) + "v")); + ASSERT_OK(Flush()); + } + + // Force out-of-space errors + env_->no_space_.store(true, std::memory_order_release); + + Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.IsIOError()); + + env_->no_space_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); +} + +TEST_F(DBIOFailureTest, NonWritableFileSystem) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 4096; + options.arena_block_size = 4096; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "v1")); + env_->non_writeable_rate_.store(100); + std::string big(100000, 
'x'); + int errors = 0; + for (int i = 0; i < 20; i++) { + if (!Put("foo", big).ok()) { + errors++; + env_->SleepForMicroseconds(100000); + } + } + ASSERT_GT(errors, 0); + env_->non_writeable_rate_.store(0); + } while (ChangeCompactOptions()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBIOFailureTest, ManifestWriteError) { + // Test for the following problem: + // (a) Compaction produces file F + // (b) Log record containing F is written to MANIFEST file, but Sync() fails + // (c) GC deletes F + // (d) After reopening DB, reads fail since deleted F is named in log record + + // We iterate twice. In the second iteration, everything is the + // same except the log record never makes it to the MANIFEST file. + for (int iter = 0; iter < 2; iter++) { + std::atomic* error_type = (iter == 0) ? &env_->manifest_sync_error_ + : &env_->manifest_write_error_; + + // Insert foo=>bar mapping + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); + + // Memtable compaction (will succeed) + Flush(); + ASSERT_EQ("bar", Get("foo")); + const int last = 2; + MoveFilesToLevel(2); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level + + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); + + error_type->store(false, std::memory_order_release); + + // Since paranoid_checks=true, writes should fail + ASSERT_NOK(Put("foo2", "bar2")); + + // Recovery: should not lose data + ASSERT_EQ("bar", Get("foo")); + + // Try again with paranoid_checks=false + Close(); + options.paranoid_checks = false; + Reopen(options); + + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + dbfull()->TEST_CompactRange(last, 
nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); + + // Recovery: should not lose data + error_type->store(false, std::memory_order_release); + Reopen(options); + ASSERT_EQ("bar", Get("foo")); + + // Since paranoid_checks=false, writes should succeed + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ("bar2", Get("foo2")); + } +} +#endif // ROCKSDB_LITE + +TEST_F(DBIOFailureTest, PutFailsParanoid) { + // Test the following: + // (a) A random put fails in paranoid mode (simulate by sync fail) + // (b) All other puts have to fail, even if writes would succeed + // (c) All of that should happen ONLY if paranoid_checks = true + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Status s; + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + s = Put(1, "foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.store(false, std::memory_order_release); + s = Put(1, "foo3", "bar3"); + // the next put should fail, too + ASSERT_TRUE(!s.ok()); + // but we're still able to read + ASSERT_EQ("bar", Get(1, "foo")); + + // do the same thing with paranoid checks off + options.paranoid_checks = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + s = Put(1, "foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.store(false, std::memory_order_release); + s = Put(1, "foo3", "bar3"); + // the next put should NOT fail + ASSERT_TRUE(s.ok()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + 
rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_iter.cc b/external/rocksdb/db/db_iter.cc index 065b8e4fc8..5de3b406b9 100644 --- a/external/rocksdb/db/db_iter.cc +++ b/external/rocksdb/db/db_iter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,17 +13,22 @@ #include #include -#include "db/filename.h" #include "db/dbformat.h" +#include "db/filename.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "port/port.h" #include "rocksdb/env.h" -#include "rocksdb/options.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" -#include "port/port.h" +#include "rocksdb/options.h" +#include "table/internal_iterator.h" #include "util/arena.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/perf_context_imp.h" +#include "util/string_util.h" namespace rocksdb { @@ -58,37 +63,89 @@ class DBIter: public Iterator { kReverse }; - DBIter(Env* env, const ImmutableCFOptions& ioptions, - const Comparator* cmp, Iterator* iter, SequenceNumber s, - bool arena_mode, uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound = nullptr) + // LocalStatistics contain Statistics counters that will be aggregated per + // each iterator instance and then will be sent to the global statistics when + // the iterator is destroyed. + // + // The purpose of this approach is to avoid perf regression happening + // when multiple threads bump the atomic counters from a DBIter::Next(). 
+ struct LocalStatistics { + explicit LocalStatistics() { ResetCounters(); } + + void ResetCounters() { + next_count_ = 0; + next_found_count_ = 0; + prev_count_ = 0; + prev_found_count_ = 0; + bytes_read_ = 0; + } + + void BumpGlobalStatistics(Statistics* global_statistics) { + RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); + RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); + RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); + RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); + RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + ResetCounters(); + } + + // Map to Tickers::NUMBER_DB_NEXT + uint64_t next_count_; + // Map to Tickers::NUMBER_DB_NEXT_FOUND + uint64_t next_found_count_; + // Map to Tickers::NUMBER_DB_PREV + uint64_t prev_count_; + // Map to Tickers::NUMBER_DB_PREV_FOUND + uint64_t prev_found_count_; + // Map to Tickers::ITER_BYTES_READ + uint64_t bytes_read_; + }; + + DBIter(Env* env, const ImmutableCFOptions& ioptions, const Comparator* cmp, + InternalIterator* iter, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + const Slice* iterate_upper_bound = nullptr, + bool prefix_same_as_start = false, bool pin_data = false) : arena_mode_(arena_mode), env_(env), logger_(ioptions.info_log), user_comparator_(cmp), - user_merge_operator_(ioptions.merge_operator), + merge_operator_(ioptions.merge_operator), iter_(iter), sequence_(s), direction_(kForward), valid_(false), current_entry_is_merged_(false), statistics_(ioptions.statistics), - iterate_upper_bound_(iterate_upper_bound) { + version_number_(version_number), + iterate_upper_bound_(iterate_upper_bound), + prefix_same_as_start_(prefix_same_as_start), + pin_thru_lifetime_(pin_data) { RecordTick(statistics_, NO_ITERATORS); prefix_extractor_ = ioptions.prefix_extractor; max_skip_ = max_sequential_skip_in_iterations; + if (pin_thru_lifetime_) { + 
pinned_iters_mgr_.StartPinning(); + } + if (iter_) { + iter_->SetPinnedItersMgr(&pinned_iters_mgr_); + } } virtual ~DBIter() { + // Release pinned data if any + pinned_iters_mgr_.ReleasePinnedIterators(); RecordTick(statistics_, NO_ITERATORS, -1); + local_stats_.BumpGlobalStatistics(statistics_); if (!arena_mode_) { delete iter_; } else { - iter_->~Iterator(); + iter_->~InternalIterator(); } } - virtual void SetIter(Iterator* iter) { + virtual void SetIter(InternalIterator* iter) { assert(iter_ == nullptr); iter_ = iter; + iter_->SetPinnedItersMgr(&pinned_iters_mgr_); } virtual bool Valid() const override { return valid_; } virtual Slice key() const override { @@ -97,8 +154,15 @@ class DBIter: public Iterator { } virtual Slice value() const override { assert(valid_); - return (direction_ == kForward && !current_entry_is_merged_) ? - iter_->value() : saved_value_; + if (current_entry_is_merged_) { + // If pinned_value_ is set then the result of merge operator is one of + // the merge operands and we should return it. + return pinned_value_.data() ? pinned_value_ : saved_value_; + } else if (direction_ == kReverse) { + return pinned_value_; + } else { + return iter_->value(); + } } virtual Status status() const override { if (status_.ok()) { @@ -108,6 +172,28 @@ class DBIter: public Iterator { } } + virtual Status GetProperty(std::string prop_name, + std::string* prop) override { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); + } + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. + if (!iter_->GetProperty(prop_name, prop).ok()) { + *prop = ToString(version_number_); + } + return Status::OK(); + } else if (prop_name == "rocksdb.iterator.is-key-pinned") { + if (valid_) { + *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? 
"1" : "0"; + } else { + *prop = "Iterator is not valid."; + } + return Status::OK(); + } + return Status::InvalidArgument("Undentified property."); + } + virtual void Next() override; virtual void Prev() override; virtual void Seek(const Slice& target) override; @@ -122,11 +208,26 @@ class DBIter: public Iterator { bool FindValueForCurrentKeyUsingSeek(); void FindPrevUserKey(); void FindNextUserKey(); - inline void FindNextUserEntry(bool skipping); - void FindNextUserEntryInternal(bool skipping); + inline void FindNextUserEntry(bool skipping, bool prefix_check); + void FindNextUserEntryInternal(bool skipping, bool prefix_check); bool ParseKey(ParsedInternalKey* key); void MergeValuesNewToOld(); + // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() + // is called + void TempPinData() { + if (!pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + } + + // Release blocks pinned by TempPinData() + void ReleaseTempPinnedData() { + if (!pin_thru_lifetime_) { + pinned_iters_mgr_.ReleasePinnedIterators(); + } + } + inline void ClearSavedValue() { if (saved_value_.capacity() > 1048576) { std::string empty; @@ -141,19 +242,31 @@ class DBIter: public Iterator { Env* const env_; Logger* logger_; const Comparator* const user_comparator_; - const MergeOperator* const user_merge_operator_; - Iterator* iter_; + const MergeOperator* const merge_operator_; + InternalIterator* iter_; SequenceNumber const sequence_; Status status_; IterKey saved_key_; std::string saved_value_; + Slice pinned_value_; Direction direction_; bool valid_; bool current_entry_is_merged_; Statistics* statistics_; uint64_t max_skip_; + uint64_t version_number_; const Slice* iterate_upper_bound_; + IterKey prefix_start_buf_; + Slice prefix_start_key_; + const bool prefix_same_as_start_; + // Means that we will pin all data blocks we read as long the Iterator + // is not deleted, will be true if ReadOptions::pin_data is true + const bool pin_thru_lifetime_; + // List of 
operands for merge operator. + MergeContext merge_context_; + LocalStatistics local_stats_; + PinnedIteratorsManager pinned_iters_mgr_; // No copying allowed DBIter(const DBIter&); @@ -175,26 +288,37 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { void DBIter::Next() { assert(valid_); + // Release temporarily pinned blocks from last operation + ReleaseTempPinnedData(); if (direction_ == kReverse) { FindNextUserKey(); direction_ = kForward; if (!iter_->Valid()) { iter_->SeekToFirst(); } + } else if (iter_->Valid() && !current_entry_is_merged_) { + // If the current value is not a merge, the iter position is the + // current key, which is already returned. We can safely issue a + // Next() without checking the current key. + // If the current key is a merge, very likely iter already points + // to the next internal position. + iter_->Next(); + PERF_COUNTER_ADD(internal_key_skipped_count, 1); } - // If the current value is merged, we might already hit end of iter_ + if (statistics_ != nullptr) { + local_stats_.next_count_++; + } + // Now we point to the next internal position, for both of merge and + // not merge cases. 
if (!iter_->Valid()) { valid_ = false; return; } - FindNextUserEntry(true /* skipping the current user key */); - if (statistics_ != nullptr) { - RecordTick(statistics_, NUMBER_DB_NEXT); - if (valid_) { - RecordTick(statistics_, NUMBER_DB_NEXT_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - } + FindNextUserEntry(true /* skipping the current user key */, prefix_same_as_start_); + if (statistics_ != nullptr && valid_) { + local_stats_.next_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); } } @@ -206,13 +330,18 @@ void DBIter::Next() { // // NOTE: In between, saved_key_ can point to a user key that has // a delete marker -inline void DBIter::FindNextUserEntry(bool skipping) { +// +// The prefix_check parameter controls whether we check the iterated +// keys against the prefix of the seeked key. Set to false when +// performing a seek without a key (e.g. SeekToFirst). Set to +// prefix_same_as_start_ for other iterations. +inline void DBIter::FindNextUserEntry(bool skipping, bool prefix_check) { PERF_TIMER_GUARD(find_next_user_entry_time); - FindNextUserEntryInternal(skipping); + FindNextUserEntryInternal(skipping, prefix_check); } // Actual implementation of DBIter::FindNextUserEntry() -void DBIter::FindNextUserEntryInternal(bool skipping) { +void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); assert(direction_ == kForward); @@ -223,7 +352,12 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { if (ParseKey(&ikey)) { if (iterate_upper_bound_ != nullptr && - ikey.user_key.compare(*iterate_upper_bound_) >= 0) { + user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) { + break; + } + + if (prefix_extractor_ && prefix_check && + prefix_extractor_->Transform(ikey.user_key).compare(prefix_start_key_) != 0) { break; } @@ -238,18 +372,24 @@ void DBIter::FindNextUserEntryInternal(bool skipping) 
{ case kTypeSingleDeletion: // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. - saved_key_.SetKey(ikey.user_key); + saved_key_.SetKey( + ikey.user_key, + !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); skipping = true; num_skipped = 0; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeValue: valid_ = true; - saved_key_.SetKey(ikey.user_key); + saved_key_.SetKey( + ikey.user_key, + !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); return; case kTypeMerge: // By now, we are sure the current ikey is going to yield a value - saved_key_.SetKey(ikey.user_key); + saved_key_.SetKey( + ikey.user_key, + !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); current_entry_is_merged_ = true; valid_ = true; MergeValuesNewToOld(); // Go to a different state machine @@ -287,17 +427,20 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { // POST: saved_value_ has the merged value for the user key // iter_ points to the next entry (or invalid) void DBIter::MergeValuesNewToOld() { - if (!user_merge_operator_) { + if (!merge_operator_) { Log(InfoLogLevel::ERROR_LEVEL, logger_, "Options::merge_operator is null."); - status_ = Status::InvalidArgument("user_merge_operator_ must be set."); + status_ = Status::InvalidArgument("merge_operator_ must be set."); valid_ = false; return; } + // Temporarily pin the blocks that hold merge operands + TempPinData(); + merge_context_.Clear(); // Start the merge process by pushing the first operand - std::deque operands; - operands.push_front(iter_->value().ToString()); + merge_context_.PushOperand(iter_->value(), + iter_->IsValuePinned() /* operand_pinned */); ParsedInternalKey ikey; for (iter_->Next(); iter_->Valid(); iter_->Next()) { @@ -319,53 +462,50 @@ void DBIter::MergeValuesNewToOld() { // final result in saved_value_. We are done! // ignore corruption if there is any. 
const Slice val = iter_->value(); - { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - user_merge_operator_->FullMerge(ikey.user_key, &val, operands, - &saved_value_, logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanos()); - } + MergeHelper::TimedFullMerge(merge_operator_, ikey.user_key, &val, + merge_context_.GetOperands(), &saved_value_, + logger_, statistics_, env_, &pinned_value_); // iter_ is positioned after put iter_->Next(); return; } else if (kTypeMerge == ikey.type) { // hit a merge, add the value as an operand and run associative merge. // when complete, add result to operands and continue. - const Slice& val = iter_->value(); - operands.push_front(val.ToString()); + merge_context_.PushOperand(iter_->value(), + iter_->IsValuePinned() /* operand_pinned */); } else { assert(false); } } - { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - // we either exhausted all internal keys under this user key, or hit - // a deletion marker. - // feed null as the existing value to the merge operator, such that - // client can differentiate this scenario and do things accordingly. - user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands, - &saved_value_, logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanos()); - } + // we either exhausted all internal keys under this user key, or hit + // a deletion marker. + // feed null as the existing value to the merge operator, such that + // client can differentiate this scenario and do things accordingly. 
+ MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetKey(), nullptr, + merge_context_.GetOperands(), &saved_value_, + logger_, statistics_, env_, &pinned_value_); } void DBIter::Prev() { assert(valid_); + ReleaseTempPinnedData(); if (direction_ == kForward) { ReverseToBackward(); } PrevInternal(); if (statistics_ != nullptr) { - RecordTick(statistics_, NUMBER_DB_PREV); + local_stats_.prev_count_++; if (valid_) { - RecordTick(statistics_, NUMBER_DB_PREV_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + local_stats_.prev_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); } } + if (valid_ && prefix_extractor_ && prefix_same_as_start_ && + prefix_extractor_->Transform(saved_key_.GetKey()) + .compare(prefix_start_key_) != 0) { + valid_ = false; + } } void DBIter::ReverseToBackward() { @@ -404,7 +544,8 @@ void DBIter::PrevInternal() { ParsedInternalKey ikey; while (iter_->Valid()) { - saved_key_.SetKey(ExtractUserKey(iter_->key())); + saved_key_.SetKey(ExtractUserKey(iter_->key()), + !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); if (FindValueForCurrentKey()) { valid_ = true; if (!iter_->Valid()) { @@ -434,8 +575,8 @@ void DBIter::PrevInternal() { // saved_value_ bool DBIter::FindValueForCurrentKey() { assert(iter_->Valid()); - // Contains operands for merge operator. 
- std::deque operands; + merge_context_.Clear(); + current_entry_is_merged_ = false; // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or // kTypeValue) ValueType last_not_merge_type = kTypeDeletion; @@ -444,6 +585,9 @@ bool DBIter::FindValueForCurrentKey() { ParsedInternalKey ikey; FindParseableKey(&ikey, kReverse); + // Temporarily pin blocks that hold (merge operands / the value) + ReleaseTempPinnedData(); + TempPinData(); size_t num_skipped = 0; while (iter_->Valid() && ikey.sequence <= sequence_ && user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) { @@ -455,19 +599,21 @@ bool DBIter::FindValueForCurrentKey() { last_key_entry_type = ikey.type; switch (last_key_entry_type) { case kTypeValue: - operands.clear(); - saved_value_ = iter_->value().ToString(); + merge_context_.Clear(); + assert(iter_->IsValuePinned()); + pinned_value_ = iter_->value(); last_not_merge_type = kTypeValue; break; case kTypeDeletion: case kTypeSingleDeletion: - operands.clear(); + merge_context_.Clear(); last_not_merge_type = last_key_entry_type; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeMerge: - assert(user_merge_operator_ != nullptr); - operands.push_back(iter_->value().ToString()); + assert(merge_operator_ != nullptr); + merge_context_.PushOperandBack( + iter_->value(), iter_->IsValuePinned() /* operand_pinned */); break; default: assert(false); @@ -486,25 +632,18 @@ bool DBIter::FindValueForCurrentKey() { valid_ = false; return false; case kTypeMerge: + current_entry_is_merged_ = true; if (last_not_merge_type == kTypeDeletion) { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands, - &saved_value_, logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanos()); + MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetKey(), + nullptr, merge_context_.GetOperands(), + &saved_value_, 
logger_, statistics_, env_, + &pinned_value_); } else { assert(last_not_merge_type == kTypeValue); - std::string last_put_value = saved_value_; - Slice temp_slice(last_put_value); - { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - user_merge_operator_->FullMerge(saved_key_.GetKey(), &temp_slice, - operands, &saved_value_, logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanos()); - } + MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetKey(), + &pinned_value_, + merge_context_.GetOperands(), &saved_value_, + logger_, statistics_, env_, &pinned_value_); } break; case kTypeValue: @@ -521,6 +660,9 @@ bool DBIter::FindValueForCurrentKey() { // This function is used in FindValueForCurrentKey. // We use Seek() function instead of Prev() to find necessary value bool DBIter::FindValueForCurrentKeyUsingSeek() { + // FindValueForCurrentKey will enable pinning before calling + // FindValueForCurrentKeyUsingSeek() + assert(pinned_iters_mgr_.PinningEnabled()); std::string last_key; AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), sequence_, kValueTypeForSeek)); @@ -534,7 +676,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeValue || ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) { if (ikey.type == kTypeValue) { - saved_value_ = iter_->value().ToString(); + assert(iter_->IsValuePinned()); + pinned_value_ = iter_->value(); valid_ = true; return true; } @@ -544,11 +687,13 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { // kTypeMerge. 
We need to collect all kTypeMerge values and save them // in operands - std::deque operands; + current_entry_is_merged_ = true; + merge_context_.Clear(); while (iter_->Valid() && user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) && ikey.type == kTypeMerge) { - operands.push_front(iter_->value().ToString()); + merge_context_.PushOperand(iter_->value(), + iter_->IsValuePinned() /* operand_pinned */); iter_->Next(); FindParseableKey(&ikey, kForward); } @@ -556,13 +701,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (!iter_->Valid() || !user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) || ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) { - { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands, - &saved_value_, logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanos()); - } + MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetKey(), nullptr, + merge_context_.GetOperands(), &saved_value_, + logger_, statistics_, env_, &pinned_value_); // Make iter_ valid and point to saved_key_ if (!iter_->Valid() || !user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) { @@ -574,13 +715,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } const Slice& val = iter_->value(); - { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - user_merge_operator_->FullMerge(saved_key_.GetKey(), &val, operands, - &saved_value_, logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanos()); - } + MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetKey(), &val, + merge_context_.GetOperands(), &saved_value_, + logger_, statistics_, env_, &pinned_value_); valid_ = true; return true; } @@ -644,6 +781,7 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { void DBIter::Seek(const Slice& target) 
{ StopWatch sw(env_, statistics_, DB_SEEK); + ReleaseTempPinnedData(); saved_key_.Clear(); // now savved_key is used to store internal key. saved_key_.SetInternalKey(target, sequence_); @@ -655,9 +793,15 @@ void DBIter::Seek(const Slice& target) { RecordTick(statistics_, NUMBER_DB_SEEK); if (iter_->Valid()) { + if (prefix_extractor_ && prefix_same_as_start_) { + prefix_start_key_ = prefix_extractor_->Transform(target); + } direction_ = kForward; ClearSavedValue(); - FindNextUserEntry(false /* not skipping */); + FindNextUserEntry(false /* not skipping */, prefix_same_as_start_); + if (!valid_) { + prefix_start_key_.clear(); + } if (statistics_ != nullptr) { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); @@ -667,6 +811,10 @@ void DBIter::Seek(const Slice& target) { } else { valid_ = false; } + if (valid_ && prefix_extractor_ && prefix_same_as_start_) { + prefix_start_buf_.SetKey(prefix_start_key_); + prefix_start_key_ = prefix_start_buf_.GetKey(); + } } void DBIter::SeekToFirst() { @@ -676,6 +824,7 @@ void DBIter::SeekToFirst() { max_skip_ = std::numeric_limits::max(); } direction_ = kForward; + ReleaseTempPinnedData(); ClearSavedValue(); { @@ -685,7 +834,7 @@ void DBIter::SeekToFirst() { RecordTick(statistics_, NUMBER_DB_SEEK); if (iter_->Valid()) { - FindNextUserEntry(false /* not skipping */); + FindNextUserEntry(false /* not skipping */, false /* no prefix check */); if (statistics_ != nullptr) { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); @@ -695,6 +844,10 @@ void DBIter::SeekToFirst() { } else { valid_ = false; } + if (valid_ && prefix_extractor_ && prefix_same_as_start_) { + prefix_start_buf_.SetKey(prefix_extractor_->Transform(saved_key_.GetKey())); + prefix_start_key_ = prefix_start_buf_.GetKey(); + } } void DBIter::SeekToLast() { @@ -704,6 +857,7 @@ void DBIter::SeekToLast() { max_skip_ = std::numeric_limits::max(); } direction_ = kReverse; + ReleaseTempPinnedData(); ClearSavedValue(); { @@ -714,7 +868,7 @@ void 
DBIter::SeekToLast() { // it will seek to the last key before the // ReadOptions.iterate_upper_bound if (iter_->Valid() && iterate_upper_bound_ != nullptr) { - saved_key_.SetKey(*iterate_upper_bound_); + saved_key_.SetKey(*iterate_upper_bound_, false /* copy */); std::string last_key; AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), kMaxSequenceNumber, @@ -740,24 +894,32 @@ void DBIter::SeekToLast() { RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); } } + if (valid_ && prefix_extractor_ && prefix_same_as_start_) { + prefix_start_buf_.SetKey(prefix_extractor_->Transform(saved_key_.GetKey())); + prefix_start_key_ = prefix_start_buf_.GetKey(); + } } Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions, const Comparator* user_key_comparator, - Iterator* internal_iter, + InternalIterator* internal_iter, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound) { - return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, - false, max_sequential_skip_in_iterations, - iterate_upper_bound); + uint64_t version_number, + const Slice* iterate_upper_bound, + bool prefix_same_as_start, bool pin_data) { + DBIter* db_iter = + new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, + false, max_sequential_skip_in_iterations, version_number, + iterate_upper_bound, prefix_same_as_start, pin_data); + return db_iter; } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; } -void ArenaWrappedDBIter::SetIterUnderDBIter(Iterator* iter) { +void ArenaWrappedDBIter::SetIterUnderDBIter(InternalIterator* iter) { static_cast(db_iter_)->SetIter(iter); } @@ -772,6 +934,10 @@ inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); } inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } inline Slice ArenaWrappedDBIter::value() const { return 
db_iter_->value(); } inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); } +inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name, + std::string* prop) { + return db_iter_->GetProperty(prop_name, prop); +} void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, void* arg2) { db_iter_->RegisterCleanup(function, arg1, arg2); @@ -779,16 +945,17 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ImmutableCFOptions& ioptions, - const Comparator* user_key_comparator, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound) { + const Comparator* user_key_comparator, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + const Slice* iterate_upper_bound, bool prefix_same_as_start, + bool pin_data) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); - DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator, - nullptr, sequence, true, max_sequential_skip_in_iterations, - iterate_upper_bound); + DBIter* db_iter = + new (mem) DBIter(env, ioptions, user_key_comparator, nullptr, sequence, + true, max_sequential_skip_in_iterations, version_number, + iterate_upper_bound, prefix_same_as_start, pin_data); iter->SetDBIter(db_iter); diff --git a/external/rocksdb/db/db_iter.h b/external/rocksdb/db/db_iter.h index c676d6cda1..89db7ad384 100644 --- a/external/rocksdb/db/db_iter.h +++ b/external/rocksdb/db/db_iter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -9,7 +9,9 @@ #pragma once #include +#include #include "rocksdb/db.h" +#include "rocksdb/iterator.h" #include "db/dbformat.h" #include "util/arena.h" #include "util/autovector.h" @@ -18,18 +20,17 @@ namespace rocksdb { class Arena; class DBIter; +class InternalIterator; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. extern Iterator* NewDBIterator( - Env* env, - const ImmutableCFOptions& options, - const Comparator *user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound = nullptr); + Env* env, const ImmutableCFOptions& options, + const Comparator* user_key_comparator, InternalIterator* internal_iter, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, const Slice* iterate_upper_bound = nullptr, + bool prefix_same_as_start = false, bool pin_data = false); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -50,7 +51,7 @@ class ArenaWrappedDBIter : public Iterator { // Set the internal iterator wrapped inside the DB Iterator. Usually it is // a merging iterator. 
- virtual void SetIterUnderDBIter(Iterator* iter); + virtual void SetIterUnderDBIter(InternalIterator* iter); virtual bool Valid() const override; virtual void SeekToFirst() override; virtual void SeekToLast() override; @@ -60,7 +61,9 @@ class ArenaWrappedDBIter : public Iterator { virtual Slice key() const override; virtual Slice value() const override; virtual Status status() const override; + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + virtual Status GetProperty(std::string prop_name, std::string* prop) override; private: DBIter* db_iter_; @@ -70,8 +73,9 @@ class ArenaWrappedDBIter : public Iterator { // Generate the arena wrapped iterator class. extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ImmutableCFOptions& options, - const Comparator* user_key_comparator, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound = nullptr); + const Comparator* user_key_comparator, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + const Slice* iterate_upper_bound = nullptr, + bool prefix_same_as_start = false, bool pin_data = false); } // namespace rocksdb diff --git a/external/rocksdb/db/db_iter_test.cc b/external/rocksdb/db/db_iter_test.cc index 68c5b158de..30956e35c7 100644 --- a/external/rocksdb/db/db_iter_test.cc +++ b/external/rocksdb/db/db_iter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -29,7 +29,7 @@ static uint64_t TestGetTickerCount(const Options& options, return options.statistics->getTickerCount(ticker_type); } -class TestIterator : public Iterator { +class TestIterator : public InternalIterator { public: explicit TestIterator(const Comparator* comparator) : initialized_(false), @@ -150,6 +150,9 @@ class TestIterator : public Iterator { return Status::OK(); } + virtual bool IsKeyPinned() const override { return true; } + virtual bool IsValuePinned() const override { return true; } + private: bool initialized_; bool valid_; @@ -181,10 +184,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -215,7 +217,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -241,7 +243,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -273,7 +276,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, 
options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -308,7 +312,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -337,7 +342,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 7, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -374,7 +380,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 4, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -399,7 +406,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -421,7 +429,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 
ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -456,7 +465,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 7, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -482,10 +492,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -524,10 +533,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -555,10 +563,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("c", "val_c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, 
ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -583,10 +590,9 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -595,10 +601,9 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -617,10 +622,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { } internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -659,9 +663,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -695,9 +698,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -724,9 +726,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 202, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, 202, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -756,10 +757,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -773,10 +773,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 200, - 
options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 200, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -809,9 +808,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -844,9 +842,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -887,7 +884,7 @@ TEST_F(DBIteratorTest, DBIterator1) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -913,7 +910,7 @@ TEST_F(DBIteratorTest, DBIterator2) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -936,7 +933,7 @@ TEST_F(DBIteratorTest, DBIterator3) { 
std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -958,7 +955,7 @@ TEST_F(DBIteratorTest, DBIterator4) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -987,7 +984,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1009,7 +1006,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations)); + 1, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1031,7 +1028,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1053,7 +1050,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 3, 
options.max_sequential_skip_in_iterations)); + 3, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1075,7 +1072,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + 4, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1097,7 +1094,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + 5, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1119,7 +1116,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + 6, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1145,7 +1142,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1167,7 +1164,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations)); + 1, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1189,7 +1186,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1211,7 +1208,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 3, options.max_sequential_skip_in_iterations)); + 3, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1229,7 +1226,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + 4, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1251,7 +1248,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + 5, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1273,7 +1270,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + 6, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1311,7 +1308,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, 
ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1345,7 +1342,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1385,7 +1382,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + 4, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1425,7 +1422,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + 5, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1470,7 +1467,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + 6, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1516,7 +1513,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations)); + 7, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1556,7 +1553,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr 
db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 9, options.max_sequential_skip_in_iterations)); + 9, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1602,7 +1599,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 13, options.max_sequential_skip_in_iterations)); + 13, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1649,7 +1646,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 14, options.max_sequential_skip_in_iterations)); + 14, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1678,7 +1675,7 @@ TEST_F(DBIteratorTest, DBIterator8) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1707,7 +1704,7 @@ TEST_F(DBIteratorTest, DBIterator9) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1754,7 +1751,7 @@ TEST_F(DBIteratorTest, DBIterator10) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -1778,9 +1775,9 @@ TEST_F(DBIteratorTest, 
SeekToLastOccurrenceSeq0) { internal_iter->AddPut("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, 0 /* force seek */)); + std::unique_ptr db_iter( + NewDBIterator(env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, 10, 0 /* force seek */, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1807,7 +1804,7 @@ TEST_F(DBIteratorTest, DBIterator11) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1832,7 +1829,7 @@ TEST_F(DBIteratorTest, DBIterator12) { std::unique_ptr db_iter( NewDBIterator(env_, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, 0)); + internal_iter, 10, 0, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -1864,16 +1861,17 @@ class DBIterWithMergeIterTest : public testing::Test { internal_iter2_->Add("d", kTypeValue, "7", 3u); internal_iter2_->Finish(); - std::vector child_iters; + std::vector child_iters; child_iters.push_back(internal_iter1_); child_iters.push_back(internal_iter2_); InternalKeyComparator icomp(BytewiseComparator()); - Iterator* merge_iter = NewMergingIterator(&icomp_, &child_iters[0], 2u); + InternalIterator* merge_iter = + NewMergingIterator(&icomp_, &child_iters[0], 2u); db_iter_.reset(NewDBIterator(env_, ImmutableCFOptions(options_), BytewiseComparator(), merge_iter, 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */)); + 3 /* max iterators before reseek */, 0)); } Env* env_; @@ -1942,8 +1940,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) { 
ASSERT_EQ(db_iter_->value().ToString(), "4"); } -#if !(defined NDEBUG) || !defined(OS_WIN) - TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) { // Test Prev() when one child iterator is at its end but more rows // are added. @@ -2294,7 +2290,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // #if !(defined NDEBUG) || !defined(OS_WIN) } // namespace rocksdb int main(int argc, char** argv) { diff --git a/external/rocksdb/db/db_iterator_test.cc b/external/rocksdb/db/db_iterator_test.cc new file mode 100644 index 0000000000..a971835c02 --- /dev/null +++ b/external/rocksdb/db/db_iterator_test.cc @@ -0,0 +1,1606 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" +#include "rocksdb/perf_context.h" +#include "port/port.h" + +namespace rocksdb { + +class DBIteratorTest : public DBTestBase { + public: + DBIteratorTest() : DBTestBase("/db_iterator_test") {} +}; + +TEST_F(DBIteratorTest, IteratorProperty) { + // The test needs to be changed if kPersistedTier is supported in iterator. 
+ Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "1", "2"); + ReadOptions ropt; + ropt.pin_data = false; + { + unique_ptr iter(db_->NewIterator(ropt, handles_[1])); + iter->SeekToFirst(); + std::string prop_value; + ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value)); + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("0", prop_value); + iter->Next(); + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("Iterator is not valid.", prop_value); + } + Close(); +} + +TEST_F(DBIteratorTest, PersistedTierOnIterator) { + // The test needs to be changed if kPersistedTier is supported in iterator. + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ReadOptions ropt; + ropt.read_tier = kPersistedTier; + + auto* iter = db_->NewIterator(ropt, handles_[1]); + ASSERT_TRUE(iter->status().IsNotSupported()); + delete iter; + + std::vector iters; + ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported()); + Close(); +} + +TEST_F(DBIteratorTest, NonBlockingIteration) { + do { + ReadOptions non_blocking_opts, regular_opts; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + non_blocking_opts.read_tier = kBlockCacheTier; + CreateAndReopenWithCF({"pikachu"}, options); + // write one kv to the database. + ASSERT_OK(Put(1, "a", "b")); + + // scan using non-blocking iterator. We should find it because + // it is in memtable. + Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + + // flush memtable to storage. Now, the key should not be in the + // memtable neither in the block cache. + ASSERT_OK(Flush(1)); + + // verify that a non-blocking iterator does not find any + // kvs. 
Neither does it do any IOs to storage. + uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get(1, "a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // This test verifies block cache behaviors, which is not used by plain + // table format. + // Exclude kHashCuckoo as it does not support iteration currently + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | + kSkipMmapReads)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBIteratorTest, ManagedNonBlockingIteration) { + do { + ReadOptions non_blocking_opts, regular_opts; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + non_blocking_opts.read_tier = kBlockCacheTier; + non_blocking_opts.managed = true; + CreateAndReopenWithCF({"pikachu"}, options); + // write one kv to the database. + ASSERT_OK(Put(1, "a", "b")); + + // scan using non-blocking iterator. We should find it because + // it is in memtable. 
+ Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + + // flush memtable to storage. Now, the key should not be in the + // memtable neither in the block cache. + ASSERT_OK(Flush(1)); + + // verify that a non-blocking iterator does not find any + // kvs. Neither does it do any IOs to storage. + int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get(1, "a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // This test verifies block cache behaviors, which is not used by plain + // table format. 
+ // Exclude kHashCuckoo as it does not support iteration currently + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | + kSkipMmapReads)); +} +#endif // ROCKSDB_LITE + +TEST_F(DBIteratorTest, IterSeekBeforePrev) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("2", "j")); + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + iter->Prev(); + iter->Seek(Slice("a")); + iter->Prev(); + delete iter; +} + +namespace { +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} +} // namespace + +TEST_F(DBIteratorTest, IterLongKeys) { + ASSERT_OK(Put(MakeLongKey(20, 0), "0")); + ASSERT_OK(Put(MakeLongKey(32, 2), "2")); + ASSERT_OK(Put("a", "b")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put(MakeLongKey(50, 1), "1")); + ASSERT_OK(Put(MakeLongKey(127, 3), "3")); + ASSERT_OK(Put(MakeLongKey(64, 4), "4")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + iter->Seek(MakeLongKey(20, 0)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); + delete iter; + + iter = db_->NewIterator(ReadOptions()); + iter->Seek(MakeLongKey(50, 1)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + delete iter; +} + +TEST_F(DBIteratorTest, IterNextWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", 
"b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("a")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + delete iter; +} + +TEST_F(DBIteratorTest, IterPrevWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; +} + +TEST_F(DBIteratorTest, IterPrevWithNewerSeq2) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + ASSERT_EQ(IterStatus(iter), "c->d"); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; +} + +TEST_F(DBIteratorTest, IterEmpty) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + 
ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, IterSingle) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, IterMulti) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", "vb")); + ASSERT_OK(Put(1, "c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), 
"(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put(1, "a", "va2")); + ASSERT_OK(Put(1, "a2", "va3")); + ASSERT_OK(Put(1, "b", "vb2")); + ASSERT_OK(Put(1, "c", "vc2")); + ASSERT_OK(Delete(1, "b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +// Check that we can skip over a run of user keys +// by using reseek rather than sequential scan +TEST_F(DBIteratorTest, IterReseek) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); + options.max_sequential_skip_in_iterations = 3; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // insert three keys with same 
userkey and verify that + // reseek is not invoked. For each of these test cases, + // verify that we can find the next key "b". + ASSERT_OK(Put(1, "a", "zero")); + ASSERT_OK(Put(1, "a", "one")); + ASSERT_OK(Put(1, "a", "two")); + ASSERT_OK(Put(1, "b", "bone")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->two"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of three keys with same userkey and verify + // that reseek is still not invoked. + ASSERT_OK(Put(1, "a", "three")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->three"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of four keys with same userkey and verify + // that reseek is invoked. + ASSERT_OK(Put(1, "a", "four")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->four"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // Testing reverse iterator + // At this point, we have three versions of "a" and one version of "b". + // The reseek statistics is already at 1. 
+ int num_reseeks = static_cast( + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + + // Insert another version of b and assert that reseek is not invoked + ASSERT_OK(Put(1, "b", "btwo")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->btwo"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; + + // insert two more versions of b. This makes a total of 4 versions + // of b and 4 versions of a. + ASSERT_OK(Put(1, "b", "bthree")); + ASSERT_OK(Put(1, "b", "bfour")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->bfour"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); + iter->Prev(); + + // the previous Prev call should have invoked reseek + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; +} + +TEST_F(DBIteratorTest, IterSmallAndLargeMix) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); + ASSERT_OK(Put(1, "c", "vc")); + ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); + ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), 
"(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, IterMultiWithDelete) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "ka", "va")); + ASSERT_OK(Put(1, "kb", "vb")); + ASSERT_OK(Put(1, "kc", "vc")); + ASSERT_OK(Delete(1, "kb")); + ASSERT_EQ("NOT_FOUND", Get(1, "kb")); + + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->Seek("kc"); + ASSERT_EQ(IterStatus(iter), "kc->vc"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "ka->va"); + } + } + delete iter; + } while (ChangeOptions()); +} + +TEST_F(DBIteratorTest, IterPrevMaxSkip) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + for (int i = 0; i < 2; i++) { + ASSERT_OK(Put(1, "key1", "v1")); + ASSERT_OK(Put(1, "key2", "v2")); + ASSERT_OK(Put(1, "key3", "v3")); + ASSERT_OK(Put(1, "key4", "v4")); + ASSERT_OK(Put(1, "key5", "v5")); + } + + VerifyIterLast("key5->v5", 1); + + ASSERT_OK(Delete(1, "key5")); + VerifyIterLast("key4->v4", 1); + + ASSERT_OK(Delete(1, "key4")); + VerifyIterLast("key3->v3", 1); + + ASSERT_OK(Delete(1, "key3")); + VerifyIterLast("key2->v2", 1); + + ASSERT_OK(Delete(1, "key2")); + VerifyIterLast("key1->v1", 1); + + ASSERT_OK(Delete(1, "key1")); + VerifyIterLast("(invalid)", 1); + } while 
(ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); +} + +TEST_F(DBIteratorTest, IterWithSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + ASSERT_OK(Put(1, "key1", "val1")); + ASSERT_OK(Put(1, "key2", "val2")); + ASSERT_OK(Put(1, "key3", "val3")); + ASSERT_OK(Put(1, "key4", "val4")); + ASSERT_OK(Put(1, "key5", "val5")); + + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = db_->NewIterator(options, handles_[1]); + + // Put more values after the snapshot + ASSERT_OK(Put(1, "key100", "val100")); + ASSERT_OK(Put(1, "key101", "val101")); + + iter->Seek("key5"); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + } + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + db_->ReleaseSnapshot(snapshot); + delete iter; + // skip as HashCuckooRep does not support snapshot + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST_F(DBIteratorTest, IteratorPinsRef) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Put(1, "foo", "hello"); + + // Get iterator that will yield the current contents of the DB. 
+ Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + // Write to force compactions + Put(1, "foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + // 100K values + ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); + } + Put(1, "foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_F(DBIteratorTest, DBIteratorBoundTest) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + + options.prefix_extractor = nullptr; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + ReadOptions ro; + // iterate_upper_bound points beyond the last expected entry + Slice prefix("foo2"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("foo1")), 0); + + iter->Next(); + // should stop here... 
+ ASSERT_TRUE(!iter->Valid()); + } + // Testing SeekToLast with iterate_upper_bound set + { + ReadOptions ro; + + Slice prefix("foo"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("a")), 0); + } + + // prefix is the first letter of the key + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing with iterate_upper_bound and prefix_extractor + // Seek target and iterate_upper_bound are not is same prefix + // This should be an error + { + ReadOptions ro; + Slice upper_bound("g"); + ro.iterate_upper_bound = &upper_bound; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo1", iter->key().ToString()); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + + // testing that iterate_upper_bound prevents iterating over deleted items + // if the bound has already reached + { + options.prefix_extractor = nullptr; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("b", "0")); + ASSERT_OK(Put("b1", "0")); + ASSERT_OK(Put("c", "0")); + ASSERT_OK(Put("d", "0")); + ASSERT_OK(Put("e", "0")); + ASSERT_OK(Delete("c")); + ASSERT_OK(Delete("d")); + + // base case with no bound + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + perf_context.Reset(); + iter->Next(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 
2); + + // now testing with iterate_bound + Slice prefix("c"); + ro.iterate_upper_bound = &prefix; + + iter.reset(db_->NewIterator(ro)); + + perf_context.Reset(); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + iter->Next(); + // the iteration should stop as soon as the bound key is reached + // even though the key is deleted + // hence internal_delete_skipped_count should be 0 + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); + } +} + +// TODO(3.13): fix the issue of Seek() + Prev() which might not necessary +// return the biggest key which is smaller than the seek key. +TEST_F(DBIteratorTest, PrevAfterMerge) { + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreatePutOperator(); + DestroyAndReopen(options); + + // write three entries with different keys using Merge() + WriteOptions wopts; + db_->Merge(wopts, "1", "data1"); + db_->Merge(wopts, "2", "data2"); + db_->Merge(wopts, "3", "data3"); + + std::unique_ptr it(db_->NewIterator(ReadOptions())); + + it->Seek("2"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("2", it->key().ToString()); + + it->Prev(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("1", it->key().ToString()); +} + +TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { + enum TestConfig { + NORMAL, + CLOSE_AND_OPEN, + COMPACT_BEFORE_READ, + FLUSH_EVERY_1000, + MAX + }; + + // Generate Random data + Random rnd(301); + + int puts = 100000; + int key_pool = static_cast(puts * 0.7); + int key_size = 100; + int val_size = 1000; + int seeks_percentage = 20; // 20% of keys will be used to test seek() + int delete_percentage = 20; // 20% of keys will be deleted + int merge_percentage = 20; // 20% of keys will be added using Merge() + + for (int run_config = 0; run_config < TestConfig::MAX; run_config++) { + Options 
options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreatePutOperator(); + DestroyAndReopen(options); + + std::vector generated_keys(key_pool); + for (int i = 0; i < key_pool; i++) { + generated_keys[i] = RandomString(&rnd, key_size); + } + + std::map true_data; + std::vector random_keys; + std::vector deleted_keys; + for (int i = 0; i < puts; i++) { + auto& k = generated_keys[rnd.Next() % key_pool]; + auto v = RandomString(&rnd, val_size); + + // Insert data to true_data map and to DB + true_data[k] = v; + if (rnd.OneIn(static_cast(100.0 / merge_percentage))) { + ASSERT_OK(db_->Merge(WriteOptions(), k, v)); + } else { + ASSERT_OK(Put(k, v)); + } + + // Pick random keys to be used to test Seek() + if (rnd.OneIn(static_cast(100.0 / seeks_percentage))) { + random_keys.push_back(k); + } + + // Delete some random keys + if (rnd.OneIn(static_cast(100.0 / delete_percentage))) { + deleted_keys.push_back(k); + true_data.erase(k); + ASSERT_OK(Delete(k)); + } + + if (run_config == TestConfig::FLUSH_EVERY_1000) { + if (i && i % 1000 == 0) { + Flush(); + } + } + } + + if (run_config == TestConfig::CLOSE_AND_OPEN) { + Close(); + Reopen(options); + } else if (run_config == TestConfig::COMPACT_BEFORE_READ) { + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + { + // Test Seek to random keys + printf("Testing seek on %" ROCKSDB_PRIszt " keys\n", random_keys.size()); + std::vector keys_slices; + std::vector true_keys; + for (auto& k : random_keys) { + iter->Seek(k); + if (!iter->Valid()) { + ASSERT_EQ(true_data.lower_bound(k), true_data.end()); + continue; + } + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + 
keys_slices.push_back(iter->key()); + true_keys.push_back(true_data.lower_bound(k)->first); + } + + for (size_t i = 0; i < keys_slices.size(); i++) { + ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]); + } + } + + { + // Test iterating all data forward + printf("Testing iterating forward on all keys\n"); + std::vector all_keys; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + all_keys.push_back(iter->key()); + } + ASSERT_EQ(all_keys.size(), true_data.size()); + + // Verify that all keys slices are valid + auto data_iter = true_data.begin(); + for (size_t i = 0; i < all_keys.size(); i++) { + ASSERT_EQ(all_keys[i].ToString(), data_iter->first); + data_iter++; + } + } + + { + // Test iterating all data backward + printf("Testing iterating backward on all keys\n"); + std::vector all_keys; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + all_keys.push_back(iter->key()); + } + ASSERT_EQ(all_keys.size(), true_data.size()); + + // Verify that all keys slices are valid (backward) + auto data_iter = true_data.rbegin(); + for (size_t i = 0; i < all_keys.size(); i++) { + ASSERT_EQ(all_keys[i].ToString(), data_iter->first); + data_iter++; + } + } + + delete iter; + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBIteratorTest, PinnedDataIteratorMultipleFiles) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.disable_auto_compactions = true; + options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb + DestroyAndReopen(options); + + std::map true_data; + + // Generate 4 sst files in L2 + Random rnd(301); + for (int i = 1; i <= 1000; i++) { + 
std::string k = Key(i * 3); + std::string v = RandomString(&rnd, 100); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + if (i % 250 == 0) { + ASSERT_OK(Flush()); + } + } + ASSERT_EQ(FilesPerLevel(0), "4"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(FilesPerLevel(0), "0,4"); + + // Generate 4 sst files in L0 + for (int i = 1; i <= 1000; i++) { + std::string k = Key(i * 2); + std::string v = RandomString(&rnd, 100); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + if (i % 250 == 0) { + ASSERT_OK(Flush()); + } + } + ASSERT_EQ(FilesPerLevel(0), "4,4"); + + // Add some keys/values in memtables + for (int i = 1; i <= 1000; i++) { + std::string k = Key(i); + std::string v = RandomString(&rnd, 100); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + } + ASSERT_EQ(FilesPerLevel(0), "4,4"); + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + std::vector> results; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + results.emplace_back(iter->key(), iter->value().ToString()); + } + + ASSERT_EQ(results.size(), true_data.size()); + auto data_iter = true_data.begin(); + for (size_t i = 0; i < results.size(); i++, data_iter++) { + auto& kv = results[i]; + ASSERT_EQ(kv.first, data_iter->first); + ASSERT_EQ(kv.second, data_iter->second); + } + + delete iter; +} +#endif + +TEST_F(DBIteratorTest, PinnedDataIteratorMergeOperator) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + DestroyAndReopen(options); + + std::string numbers[7]; + for (int val = 0; val <= 6; val++) { + PutFixed64(numbers + val, val); + } + + // +1 all keys in range [ 0 => 999] + for (int i = 0; i < 
1000; i++) { + WriteOptions wo; + ASSERT_OK(db_->Merge(wo, Key(i), numbers[1])); + } + + // +2 all keys divisible by 2 in range [ 0 => 999] + for (int i = 0; i < 1000; i += 2) { + WriteOptions wo; + ASSERT_OK(db_->Merge(wo, Key(i), numbers[2])); + } + + // +3 all keys divisible by 5 in range [ 0 => 999] + for (int i = 0; i < 1000; i += 5) { + WriteOptions wo; + ASSERT_OK(db_->Merge(wo, Key(i), numbers[3])); + } + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + std::vector> results; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + results.emplace_back(iter->key(), iter->value().ToString()); + } + + ASSERT_EQ(results.size(), 1000); + for (size_t i = 0; i < results.size(); i++) { + auto& kv = results[i]; + ASSERT_EQ(kv.first, Key(static_cast(i))); + int expected_val = 1; + if (i % 2 == 0) { + expected_val += 2; + } + if (i % 5 == 0) { + expected_val += 3; + } + ASSERT_EQ(kv.second, numbers[expected_val]); + } + + delete iter; +} + +TEST_F(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.use_delta_encoding = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.write_buffer_size = 100000; + DestroyAndReopen(options); + + Random rnd(301); + + std::map true_data; + for (int i = 0; i < 1000; i++) { + std::string k = RandomString(&rnd, 10); + std::string v = RandomString(&rnd, 1000); + ASSERT_OK(Put(k, v)); + true_data[k] = v; + } + + ReadOptions ro; + ro.pin_data = true; + auto iter = db_->NewIterator(ro); + + // Delete 50% of the keys and update the other 50% + for (auto& kv : true_data) { + if (rnd.OneIn(2)) { + ASSERT_OK(Delete(kv.first)); + } else { + std::string new_val = RandomString(&rnd, 1000); + ASSERT_OK(Put(kv.first, new_val)); + } + } + + std::vector> 
results; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + results.emplace_back(iter->key(), iter->value().ToString()); + } + + auto data_iter = true_data.begin(); + for (size_t i = 0; i < results.size(); i++, data_iter++) { + auto& kv = results[i]; + ASSERT_EQ(kv.first, data_iter->first); + ASSERT_EQ(kv.second, data_iter->second); + } + + delete iter; +} + +TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocks) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; // every block will contain one entry + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.disable_auto_compactions = true; + options.max_sequential_skip_in_iterations = 8; + + DestroyAndReopen(options); + + // Putting such deletes will force DBIter::Prev() to fallback to a Seek + for (int file_num = 0; file_num < 10; file_num++) { + ASSERT_OK(Delete("key4")); + ASSERT_OK(Flush()); + } + + // First File containing 5 blocks of puts + ASSERT_OK(Put("key1", "val1.0")); + ASSERT_OK(Put("key2", "val2.0")); + ASSERT_OK(Put("key3", "val3.0")); + ASSERT_OK(Put("key4", "val4.0")); + ASSERT_OK(Put("key5", "val5.0")); + ASSERT_OK(Flush()); + + // Second file containing 9 blocks of merge operands + ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1")); + ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2")); + + ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1")); + ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2")); + ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3")); + + ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1")); + ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2")); + ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3")); + ASSERT_OK(db_->Merge(WriteOptions(), "key3", 
"val3.4")); + ASSERT_OK(Flush()); + + { + ReadOptions ro; + ro.fill_cache = false; + Iterator* iter = db_->NewIterator(ro); + + iter->SeekToLast(); + ASSERT_EQ(iter->key().ToString(), "key5"); + ASSERT_EQ(iter->value().ToString(), "val5.0"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key4"); + ASSERT_EQ(iter->value().ToString(), "val4.0"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key3"); + ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key2"); + ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3"); + + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "key1"); + ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2"); + + delete iter; + } +} + +TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.disable_auto_compactions = true; + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.max_sequential_skip_in_iterations = 8; + DestroyAndReopen(options); + + const int kNumKeys = 500; + // Small number of merge operands to make sure that DBIter::Prev() dont + // fall back to Seek() + const int kNumMergeOperands = 3; + // Use value size that will make sure that every block contain 1 key + const int kValSize = + static_cast(BlockBasedTableOptions().block_size) * 4; + // Percentage of keys that wont get merge operations + const int kNoMergeOpPercentage = 20; + // Percentage of keys that will be deleted + const int kDeletePercentage = 10; + + // For half of the key range we will write multiple deletes first to + // force DBIter::Prev() to fall back to Seek() + for (int file_num = 0; file_num < 10; file_num++) { + for (int i = 0; i < kNumKeys; i += 2) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + } + + Random rnd(301); + std::map true_data; + 
std::string gen_key; + std::string gen_val; + + for (int i = 0; i < kNumKeys; i++) { + gen_key = Key(i); + gen_val = RandomString(&rnd, kValSize); + + ASSERT_OK(Put(gen_key, gen_val)); + true_data[gen_key] = gen_val; + } + ASSERT_OK(Flush()); + + // Separate values and merge operands in different file so that we + // make sure that we dont merge them while flushing but actually + // merge them in the read path + for (int i = 0; i < kNumKeys; i++) { + if (rnd.OneIn(static_cast(100.0 / kNoMergeOpPercentage))) { + // Dont give merge operations for some keys + continue; + } + + for (int j = 0; j < kNumMergeOperands; j++) { + gen_key = Key(i); + gen_val = RandomString(&rnd, kValSize); + + ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val)); + true_data[gen_key] += "," + gen_val; + } + } + ASSERT_OK(Flush()); + + for (int i = 0; i < kNumKeys; i++) { + if (rnd.OneIn(static_cast(100.0 / kDeletePercentage))) { + gen_key = Key(i); + + ASSERT_OK(Delete(gen_key)); + true_data.erase(gen_key); + } + } + ASSERT_OK(Flush()); + + { + ReadOptions ro; + ro.fill_cache = false; + Iterator* iter = db_->NewIterator(ro); + auto data_iter = true_data.rbegin(); + + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + data_iter++; + } + ASSERT_EQ(data_iter, true_data.rend()); + + delete iter; + } + + { + ReadOptions ro; + ro.fill_cache = false; + Iterator* iter = db_->NewIterator(ro); + auto data_iter = true_data.rbegin(); + + int entries_right = 0; + std::string seek_key; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + // Verify key/value of current position + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + + bool restore_position_with_seek = rnd.Uniform(2); + if (restore_position_with_seek) { + seek_key = iter->key().ToString(); + } + + // Do some Next() operations the restore the iterator to 
orignal position + int next_count = + entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0; + for (int i = 0; i < next_count; i++) { + iter->Next(); + data_iter--; + + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + } + + if (restore_position_with_seek) { + // Restore orignal position using Seek() + iter->Seek(seek_key); + for (int i = 0; i < next_count; i++) { + data_iter++; + } + + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + } else { + // Restore original position using Prev() + for (int i = 0; i < next_count; i++) { + iter->Prev(); + data_iter++; + + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + } + } + + entries_right++; + data_iter++; + } + ASSERT_EQ(data_iter, true_data.rend()); + + delete iter; + } +} + +TEST_F(DBIteratorTest, IteratorWithLocalStatistics) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 1000; i++) { + // Key 10 bytes / Value 10 bytes + ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + } + + std::atomic total_next(0); + std::atomic total_next_found(0); + std::atomic total_prev(0); + std::atomic total_prev_found(0); + std::atomic total_bytes(0); + + std::vector threads; + std::function reader_func_next = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + // Seek will bump ITER_BYTES_READ + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + while (true) { + iter->Next(); + total_next++; + + if (!iter->Valid()) { + break; + } + total_next_found++; + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + } + + delete iter; + }; + + std::function reader_func_prev = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + + 
iter->SeekToLast(); + // Seek will bump ITER_BYTES_READ + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + while (true) { + iter->Prev(); + total_prev++; + + if (!iter->Valid()) { + break; + } + total_prev_found++; + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + } + + delete iter; + }; + + for (int i = 0; i < 10; i++) { + threads.emplace_back(reader_func_next); + } + for (int i = 0; i < 15; i++) { + threads.emplace_back(reader_func_prev); + } + + for (auto& t : threads) { + t.join(); + } + + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), total_next); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND), + total_next_found); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), total_prev); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND), + total_prev_found); + ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ), total_bytes); +} + +TEST_F(DBIteratorTest, ReadAhead) { + Options options; + env_->count_random_reads_ = true; + options.env = env_; + options.disable_auto_compactions = true; + options.write_buffer_size = 4 << 20; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + table_options.no_block_cache = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + + std::string value(1024, 'a'); + for (int i = 0; i < 100; i++) { + Put(Key(i), value); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + for (int i = 0; i < 100; i++) { + Put(Key(i), value); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + for (int i = 0; i < 100; i++) { + Put(Key(i), value); + } + ASSERT_OK(Flush()); +#ifndef ROCKSDB_LITE + ASSERT_EQ("1,1,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + env_->random_read_bytes_counter_ = 0; + options.statistics->setTickerCount(NO_FILE_OPENS, 0); + ReadOptions read_options; + auto* iter = db_->NewIterator(read_options); + 
iter->SeekToFirst(); + int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS); + size_t bytes_read = env_->random_read_bytes_counter_; + delete iter; + + env_->random_read_bytes_counter_ = 0; + options.statistics->setTickerCount(NO_FILE_OPENS, 0); + read_options.readahead_size = 1024 * 10; + iter = db_->NewIterator(read_options); + iter->SeekToFirst(); + int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS); + size_t bytes_read_readahead = env_->random_read_bytes_counter_; + delete iter; + ASSERT_EQ(num_file_opens + 3, num_file_opens_readahead); + ASSERT_GT(bytes_read_readahead, bytes_read); + ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3); + + // Verify correctness. + iter = db_->NewIterator(read_options); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(value, iter->value()); + count++; + } + ASSERT_EQ(100, count); + for (int i = 0; i < 100; i++) { + iter->Seek(Key(i)); + ASSERT_EQ(value, iter->value()); + } + delete iter; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_log_iter_test.cc b/external/rocksdb/db/db_log_iter_test.cc index a1e8d2012e..956f601a7a 100644 --- a/external/rocksdb/db/db_log_iter_test.cc +++ b/external/rocksdb/db/db_log_iter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,10 +10,10 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. 
// which is a pity, it is a good test -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) +#include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/db_test_util.h" namespace rocksdb { @@ -27,7 +27,7 @@ class DBTestXactLogIterator : public DBTestBase { Status status = dbfull()->GetUpdatesSince(seq, &iter); EXPECT_OK(status); EXPECT_TRUE(iter->Valid()); - return std::move(iter); + return iter; } }; @@ -277,10 +277,10 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { } } // namespace rocksdb -#endif // !(defined NDEBUG) || !defined(OS_WIN) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/external/rocksdb/db/db_options_test.cc b/external/rocksdb/db/db_options_test.cc new file mode 100644 index 0000000000..0d484d79f3 --- /dev/null +++ b/external/rocksdb/db/db_options_test.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "util/sync_point.h" + +namespace rocksdb { + +class DBOptionsTest : public DBTestBase { + public: + DBOptionsTest() : DBTestBase("/db_options_test") {} +}; + +// RocksDB lite don't support dynamic options. 
+#ifndef ROCKSDB_LITE + +TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) { + const std::string kValue(1024, 'v'); + for (int method_type = 0; method_type < 2; method_type++) { + for (int option_type = 0; option_type < 4; option_type++) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.write_buffer_size = 1024 * 1024; + options.compression = CompressionType::kNoCompression; + options.level0_file_num_compaction_trigger = 1; + options.level0_stop_writes_trigger = std::numeric_limits::max(); + options.level0_slowdown_writes_trigger = std::numeric_limits::max(); + options.hard_pending_compaction_bytes_limit = + std::numeric_limits::max(); + options.soft_pending_compaction_bytes_limit = + std::numeric_limits::max(); + + DestroyAndReopen(options); + for (int i = 0; i < 1024 * 2; i++) { + Put(Key(i), kValue); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + uint64_t l0_size = SizeAtLevel(0); + + switch (option_type) { + case 0: + // test with level0_stop_writes_trigger + options.level0_stop_writes_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + break; + case 1: + options.level0_slowdown_writes_trigger = 2; + break; + case 2: + options.hard_pending_compaction_bytes_limit = l0_size; + options.soft_pending_compaction_bytes_limit = l0_size; + break; + case 3: + options.soft_pending_compaction_bytes_limit = l0_size; + break; + } + Reopen(options); + dbfull()->TEST_WaitForCompact(); + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction():BeforePickCompaction", + "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"}, + {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3", + 
"DBImpl::BackgroundCompaction():AfterPickCompaction"}}); + // Block background compaction. + SyncPoint::GetInstance()->EnableProcessing(); + + switch (method_type) { + case 0: + ASSERT_OK( + dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + break; + case 1: + ASSERT_OK(dbfull()->EnableAutoCompaction( + {dbfull()->DefaultColumnFamily()})); + break; + } + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1"); + // Wait for stall condition recalculate. + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"); + + switch (option_type) { + case 0: + ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped()); + break; + case 1: + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + break; + case 2: + ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped()); + break; + case 3: + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + break; + } + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3"); + + // Background compaction executed. + dbfull()->TEST_WaitForCompact(); + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + } + } +} + +#endif // ROCKSDB_LITE + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_properties_test.cc b/external/rocksdb/db/db_properties_test.cc new file mode 100644 index 0000000000..d1e0478d41 --- /dev/null +++ b/external/rocksdb/db/db_properties_test.cc @@ -0,0 +1,1278 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/perf_level.h" +#include "rocksdb/table.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace rocksdb { + +class DBPropertiesTest : public DBTestBase { + public: + DBPropertiesTest() : DBTestBase("/db_properties_test") {} +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBPropertiesTest, Empty) { + do { + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + std::string num; + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("0", num); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); + + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("2", num); + + Put(1, "k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); + + ASSERT_EQ("v1", Get(1, "foo")); + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + 
dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("1", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("2", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("3", num); + + ASSERT_OK(db_->EnableFileDeletions(false)); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("2", num); + + ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); + } while (ChangeOptions()); +} + +TEST_F(DBPropertiesTest, CurrentVersionNumber) { + uint64_t v1, v2, v3; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1)); + Put("12345678", ""); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3)); + + ASSERT_EQ(v1, v2); + ASSERT_GT(v3, v2); +} + +TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) { + const int kKeySize = 100; + const int kValueSize = 500; + const int kKeyNum = 100; + + Options options; + options.env = env_; + options.create_if_missing = true; + options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10; + // Make them never flush + options.min_write_buffer_number_to_merge = 1000; + options.max_write_buffer_number = 1000; + options = CurrentOptions(options); + CreateAndReopenWithCF({"one", "two", "three", "four"}, options); + + Random rnd(301); + for (auto* handle : handles_) { + for (int i = 0; i < kKeyNum; ++i) { + db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + } + + uint64_t manual_sum = 0; + uint64_t api_sum = 0; + uint64_t value = 0; + for (auto* handle : handles_) { + ASSERT_TRUE( + 
db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value)); + manual_sum += value; + } + ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables, + &api_sum)); + ASSERT_GT(manual_sum, 0); + ASSERT_EQ(manual_sum, api_sum); + + ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value)); + + uint64_t before_flush_trm; + uint64_t after_flush_trm; + for (auto* handle : handles_) { + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &before_flush_trm)); + + // Issue flush and expect larger memory usage of table readers. + db_->Flush(FlushOptions(), handle); + + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &after_flush_trm)); + ASSERT_GT(after_flush_trm, before_flush_trm); + } +} + +namespace { +void ResetTableProperties(TableProperties* tp) { + tp->data_size = 0; + tp->index_size = 0; + tp->filter_size = 0; + tp->raw_key_size = 0; + tp->raw_value_size = 0; + tp->num_data_blocks = 0; + tp->num_entries = 0; +} + +void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { + double dummy_double; + std::replace(tp_string.begin(), tp_string.end(), ';', ' '); + std::replace(tp_string.begin(), tp_string.end(), '=', ' '); + ResetTableProperties(tp); + + sscanf(tp_string.c_str(), + "# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64 + " raw average key size %lf " + " raw value size %" SCNu64 + " raw average value size %lf " + " data block size %" SCNu64 " index block size %" SCNu64 + " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, + &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->index_size, &tp->filter_size); +} + +void VerifySimilar(uint64_t a, uint64_t b, double bias) { + ASSERT_EQ(a == 0U, b == 0U); + if (a == 0) { + return; + } + double dbl_a = static_cast(a); + double dbl_b = static_cast(b); + if (dbl_a > dbl_b) { + 
ASSERT_LT(static_cast(dbl_a - dbl_b) / (dbl_a + dbl_b), bias); + } else { + ASSERT_LT(static_cast(dbl_b - dbl_a) / (dbl_a + dbl_b), bias); + } +} + +void VerifyTableProperties(const TableProperties& base_tp, + const TableProperties& new_tp, + double filter_size_bias = 0.1, + double index_size_bias = 0.1, + double data_size_bias = 0.1, + double num_data_blocks_bias = 0.05) { + VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); + VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); + VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); + VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks, + num_data_blocks_bias); + ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); + ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); + ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); +} + +void GetExpectedTableProperties(TableProperties* expected_tp, + const int kKeySize, const int kValueSize, + const int kKeysPerTable, const int kTableCount, + const int kBloomBitsPerKey, + const size_t kBlockSize) { + const int kKeyCount = kTableCount * kKeysPerTable; + const int kAvgSuccessorSize = kKeySize / 5; + const int kEncodingSavePerKey = kKeySize / 4; + expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); + expected_tp->raw_value_size = kKeyCount * kValueSize; + expected_tp->num_entries = kKeyCount; + expected_tp->num_data_blocks = + kTableCount * + (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / + kBlockSize; + expected_tp->data_size = + kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); + expected_tp->index_size = + expected_tp->num_data_blocks * (kAvgSuccessorSize + 8); + expected_tp->filter_size = + kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); +} +} // anonymous namespace + +TEST_F(DBPropertiesTest, ValidatePropertyInfo) { + for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) { + // If C++ gets a std::string_literal, this would be better to check at + 
// compile-time using static_assert. + ASSERT_TRUE(ppt_name_and_info.first.empty() || + !isdigit(ppt_name_and_info.first.back())); + + ASSERT_TRUE((ppt_name_and_info.second.handle_string == nullptr) != + (ppt_name_and_info.second.handle_int == nullptr)); + } +} + +TEST_F(DBPropertiesTest, ValidateSampleNumber) { + // When "max_open_files" is -1, we read all the files for + // "rocksdb.estimate-num-keys" computation, which is the ground truth. + // Otherwise, we sample 20 newest files to make an estimation. + // Formula: lastest_20_files_active_key_ratio * total_files + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.level0_stop_writes_trigger = 1000; + DestroyAndReopen(options); + int key = 0; + for (int files = 20; files >= 10; files -= 10) { + for (int i = 0; i < files; i++) { + int rows = files / 10; + for (int j = 0; j < rows; j++) { + db_->Put(WriteOptions(), std::to_string(++key), "foo"); + } + db_->Flush(FlushOptions()); + } + } + std::string num; + Reopen(options); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ("45", num); + options.max_open_files = -1; + Reopen(options); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ("50", num); +} + +TEST_F(DBPropertiesTest, AggregatedTableProperties) { + for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) { + const int kKeysPerTable = 100; + const int kKeySize = 80; + const int kValueSize = 200; + const int kBloomBitsPerKey = 20; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 8; + options.compression = kNoCompression; + options.create_if_missing = true; + + BlockBasedTableOptions table_options; + table_options.filter_policy.reset( + NewBloomFilterPolicy(kBloomBitsPerKey, false)); + table_options.block_size = 1024; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + Random rnd(5632); + for (int 
table = 1; table <= kTableCount; ++table) { + for (int i = 0; i < kKeysPerTable; ++i) { + db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + db_->Flush(FlushOptions()); + } + std::string property; + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + + TableProperties expected_tp; + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, kTableCount, kBloomBitsPerKey, + table_options.block_size); + + TableProperties output_tp; + ParseTablePropertiesString(property, &output_tp); + + VerifyTableProperties(expected_tp, output_tp); + } +} + +TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) { + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; + options.level0_file_num_compaction_trigger = 6; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 4500 << 10; + options.target_file_size_base = 98 << 10; + options.max_write_buffer_number = 2; + options.statistics = rocksdb::CreateDBStatistics(); + options.max_open_files = 100; + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + + DestroyAndReopen(options); + int key_index = 0; + Random rnd(301); + for (int num = 0; num < 8; num++) { + Put("foo", "bar"); + GenerateNewFile(&rnd, &key_index); + dbfull()->TEST_WaitForCompact(); + } + dbfull()->TEST_WaitForCompact(); + + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + + // Get() after flushes, See latency histogram tracked. + for (int key = 0; key < key_index; key++) { + Get(Key(key)); + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + + // Reopen and issue Get(). 
See thee latency tracked + Reopen(options); + dbfull()->TEST_WaitForCompact(); + for (int key = 0; key < key_index; key++) { + Get(Key(key)); + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + + // Reopen and issue iterating. See thee latency tracked + Reopen(options); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + { + unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { + } + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + + // options.max_open_files preloads table readers. 
+ options.max_open_files = -1; + Reopen(options); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + for (int key = 0; key < key_index; key++) { + Get(Key(key)); + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); +} + +TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { + const int kTableCount = 100; + const int kKeysPerTable = 10; + const int kKeySize = 50; + const int kValueSize = 400; + const int kMaxLevel = 7; + const int kBloomBitsPerKey = 20; + Random rnd(301); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 8; + options.compression = kNoCompression; + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.target_file_size_base = 8192; + options.max_bytes_for_level_base = 10000; + options.max_bytes_for_level_multiplier = 2; + // This ensures there no compaction happening when we call GetProperty(). 
+ options.disable_auto_compactions = true; + + BlockBasedTableOptions table_options; + table_options.filter_policy.reset( + NewBloomFilterPolicy(kBloomBitsPerKey, false)); + table_options.block_size = 1024; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + std::string level_tp_strings[kMaxLevel]; + std::string tp_string; + TableProperties level_tps[kMaxLevel]; + TableProperties tp, sum_tp, expected_tp; + for (int table = 1; table <= kTableCount; ++table) { + for (int i = 0; i < kKeysPerTable; ++i) { + db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + db_->Flush(FlushOptions()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ResetTableProperties(&sum_tp); + for (int level = 0; level < kMaxLevel; ++level) { + db_->GetProperty( + DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level), + &level_tp_strings[level]); + ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]); + sum_tp.data_size += level_tps[level].data_size; + sum_tp.index_size += level_tps[level].index_size; + sum_tp.filter_size += level_tps[level].filter_size; + sum_tp.raw_key_size += level_tps[level].raw_key_size; + sum_tp.raw_value_size += level_tps[level].raw_value_size; + sum_tp.num_data_blocks += level_tps[level].num_data_blocks; + sum_tp.num_entries += level_tps[level].num_entries; + } + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); + ParseTablePropertiesString(tp_string, &tp); + ASSERT_EQ(sum_tp.data_size, tp.data_size); + ASSERT_EQ(sum_tp.index_size, tp.index_size); + ASSERT_EQ(sum_tp.filter_size, tp.filter_size); + ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size); + ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); + ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); + ASSERT_EQ(sum_tp.num_entries, tp.num_entries); + if (table > 3) { + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, 
table, kBloomBitsPerKey, + table_options.block_size); + // Gives larger bias here as index block size, filter block size, + // and data block size become much harder to estimate in this test. + VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); + } + } +} + +TEST_F(DBPropertiesTest, NumImmutableMemTable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_number_to_maintain = 4; + options.write_buffer_size = 1000000; + CreateAndReopenWithCF({"pikachu"}, options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + SetPerfLevel(kEnableTime); + ASSERT_TRUE(GetPerfLevel() == kEnableTime); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(1, static_cast(perf_context.get_from_memtable_count)); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "1"); + + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(2, static_cast(perf_context.get_from_memtable_count)); + perf_context.Reset(); + Get(1, "k2"); + ASSERT_EQ(1, static_cast(perf_context.get_from_memtable_count)); + + 
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get(1, "k2"); + ASSERT_EQ(2, static_cast(perf_context.get_from_memtable_count)); + perf_context.Reset(); + Get(1, "k3"); + ASSERT_EQ(1, static_cast(perf_context.get_from_memtable_count)); + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(3, static_cast(perf_context.get_from_memtable_count)); + + ASSERT_OK(Flush(1)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num)); + ASSERT_EQ(num, "3"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); + // "192" is the size of the metadata of an empty skiplist, this would + // break if we change the default skiplist implementation + ASSERT_EQ(num, "192"); + + uint64_t int_num; + uint64_t base_total_size; + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.estimate-num-keys", &base_total_size)); + + ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", "")); + ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3")); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num)); + ASSERT_EQ(int_num, 2U); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &int_num)); + ASSERT_EQ(int_num, 3U); + + 
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num)); + ASSERT_EQ(int_num, 4U); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num)); + ASSERT_EQ(int_num, 2U); + + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.estimate-num-keys", &int_num)); + ASSERT_EQ(int_num, base_total_size + 1); + + SetPerfLevel(kDisable); + ASSERT_TRUE(GetPerfLevel() == kDisable); + } while (ChangeCompactOptions()); +} + +TEST_F(DBPropertiesTest, GetProperty) { + // Set sizes to both background thread pool to be 1 and block them. + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); + + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = 1; + options.compaction_options_universal.size_ratio = 50; + options.max_background_compactions = 1; + options.max_background_flushes = 1; + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 0; + options.write_buffer_size = 1000000; + Reopen(options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + uint64_t int_num; + SetPerfLevel(kEnableTime); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + ASSERT_TRUE( + 
dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "1"); + perf_context.Reset(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing")); + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "2"); + // Verify the same set of properties through GetIntProperty + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num)); + ASSERT_EQ(int_num, 2U); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num)); + ASSERT_EQ(int_num, 1U); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num)); + ASSERT_EQ(int_num, 0U); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); + ASSERT_EQ(int_num, 2U); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + + 
ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); + ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value)); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "4"); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Wait for compaction to be done. This is important because otherwise RocksDB + // might schedule a compaction when reopening the database, failing assertion + // (A) as a result. + dbfull()->TEST_WaitForCompact(); + options.max_open_files = 10; + Reopen(options); + // After reopening, no table reader is loaded, so no memory for table readers + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); // (A) + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); + ASSERT_GT(int_num, 0U); + + // After reading a key, at least one table reader is loaded. 
+ Get("k5"); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + // Test rocksdb.num-live-versions + { + options.level0_file_num_compaction_trigger = 20; + Reopen(options); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 1U); + + // Use an iterator to hold current version + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + + ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 2U); + + // Use an iterator to hold current version + std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + + ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 3U); + + iter2.reset(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 2U); + + iter1.reset(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 1U); + } +} + +TEST_F(DBPropertiesTest, ApproximateMemoryUsage) { + const int kNumRounds = 10; + // TODO(noetzli) kFlushesPerRound does not really correlate with how many + // flushes happen. + const int kFlushesPerRound = 10; + const int kWritesPerFlush = 10; + const int kKeySize = 100; + const int kValueSize = 1000; + Options options; + options.write_buffer_size = 1000; // small write buffer + options.min_write_buffer_number_to_merge = 4; + options.compression = kNoCompression; + options.create_if_missing = true; + options = CurrentOptions(options); + DestroyAndReopen(options); + + Random rnd(301); + + std::vector iters; + + uint64_t active_mem; + uint64_t unflushed_mem; + uint64_t all_mem; + uint64_t prev_all_mem; + + // Phase 0. 
The verify the initial value of all these properties are the same + // as we have no mem-tables. + dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_EQ(all_mem, active_mem); + ASSERT_EQ(all_mem, unflushed_mem); + + // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" equals to + // "size-all-mem-tables" + for (int r = 0; r < kNumRounds; ++r) { + for (int f = 0; f < kFlushesPerRound; ++f) { + for (int w = 0; w < kWritesPerFlush; ++w) { + Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + } + } + // Make sure that there is no flush between getting the two properties. + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + // in no iterator case, these two number should be the same. + ASSERT_EQ(unflushed_mem, all_mem); + } + prev_all_mem = all_mem; + + // Phase 2. Keep issuing Put() but also create new iterators. This time we + // expect "size-all-mem-tables" > "cur-size-all-mem-tables". + for (int r = 0; r < kNumRounds; ++r) { + iters.push_back(db_->NewIterator(ReadOptions())); + for (int f = 0; f < kFlushesPerRound; ++f) { + for (int w = 0; w < kWritesPerFlush; ++w) { + Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + } + } + // Force flush to prevent flush from happening between getting the + // properties or after getting the properties and before the new round. + Flush(); + + // In the second round, add iterators. 
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_GT(all_mem, active_mem); + ASSERT_GT(all_mem, unflushed_mem); + ASSERT_GT(all_mem, prev_all_mem); + prev_all_mem = all_mem; + } + + // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks + // whenever we release an iterator. + for (auto* iter : iters) { + delete iter; + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + // Expect the size shrinking + ASSERT_LT(all_mem, prev_all_mem); + prev_all_mem = all_mem; + } + + // Expect all these three counters to be the same. + dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_EQ(active_mem, unflushed_mem); + ASSERT_EQ(unflushed_mem, all_mem); + + // Phase 5. Reopen, and expect all these three counters to be the same again. + Reopen(options); + dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_EQ(active_mem, unflushed_mem); + ASSERT_EQ(unflushed_mem, all_mem); +} + +TEST_F(DBPropertiesTest, EstimatePendingCompBytes) { + // Set sizes to both background thread pool to be 1 and block them. 
+ env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 2; + options.max_background_compactions = 1; + options.max_background_flushes = 1; + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 0; + options.write_buffer_size = 1000000; + Reopen(options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + uint64_t int_num; + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + Flush(); + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + Flush(); + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + Flush(); + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_GT(int_num, 0U); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_EQ(int_num, 0U); +} + +TEST_F(DBPropertiesTest, EstimateCompressionRatio) { + if (!Snappy_Supported()) { + return; + } + const int kNumL0Files = 3; + const int kNumEntriesPerFile = 1000; + + Options options = CurrentOptions(); + options.compression_per_level = {kNoCompression, kSnappyCompression}; + options.disable_auto_compactions = true; + options.max_background_flushes = 0; + 
options.num_levels = 2; + Reopen(options); + + // compression ratio is -1.0 when no open files at level + ASSERT_EQ(CompressionRatioAtLevel(0), -1.0); + + const std::string kVal(100, 'a'); + for (int i = 0; i < kNumL0Files; ++i) { + for (int j = 0; j < kNumEntriesPerFile; ++j) { + // Put common data ("key") at end to prevent delta encoding from + // compressing the key effectively + std::string key = ToString(i) + ToString(j) + "key"; + ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal)); + } + Flush(); + } + + // no compression at L0, so ratio is less than one + ASSERT_LT(CompressionRatioAtLevel(0), 1.0); + ASSERT_GT(CompressionRatioAtLevel(0), 0.0); + ASSERT_EQ(CompressionRatioAtLevel(1), -1.0); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + + ASSERT_EQ(CompressionRatioAtLevel(0), -1.0); + // Data at L1 should be highly compressed thanks to Snappy and redundant data + // in values (ratio is 12.846 as of 4/19/2016). + ASSERT_GT(CompressionRatioAtLevel(1), 10.0); +} + +#endif // ROCKSDB_LITE + +class CountingUserTblPropCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return "CountingUserTblPropCollector"; } + + Status Finish(UserCollectedProperties* properties) override { + std::string encoded; + PutVarint32(&encoded, count_); + *properties = UserCollectedProperties{ + {"CountingUserTblPropCollector", message_}, {"Count", encoded}, + }; + return Status::OK(); + } + + Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, + SequenceNumber seq, uint64_t file_size) override { + ++count_; + return Status::OK(); + } + + virtual UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + private: + std::string message_ = "Rocksdb"; + uint32_t count_ = 0; +}; + +class CountingUserTblPropCollectorFactory + : public TablePropertiesCollectorFactory { + public: + explicit CountingUserTblPropCollectorFactory( + uint32_t expected_column_family_id) + : 
expected_column_family_id_(expected_column_family_id), + num_created_(0) {} + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + EXPECT_EQ(expected_column_family_id_, context.column_family_id); + num_created_++; + return new CountingUserTblPropCollector(); + } + const char* Name() const override { + return "CountingUserTblPropCollectorFactory"; + } + void set_expected_column_family_id(uint32_t v) { + expected_column_family_id_ = v; + } + uint32_t expected_column_family_id_; + uint32_t num_created_; +}; + +class CountingDeleteTabPropCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return "CountingDeleteTabPropCollector"; } + + Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, + SequenceNumber seq, uint64_t file_size) override { + if (type == kEntryDelete) { + num_deletes_++; + } + return Status::OK(); + } + + bool NeedCompact() const override { return num_deletes_ > 10; } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + Status Finish(UserCollectedProperties* properties) override { + *properties = + UserCollectedProperties{{"num_delete", ToString(num_deletes_)}}; + return Status::OK(); + } + + private: + uint32_t num_deletes_ = 0; +}; + +class CountingDeleteTabPropCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + return new CountingDeleteTabPropCollector(); + } + const char* Name() const override { + return "CountingDeleteTabPropCollectorFactory"; + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = (1 << 30); + options.max_background_flushes = 0; + 
options.table_properties_collector_factories.resize(1); + std::shared_ptr collector_factory = + std::make_shared(0); + options.table_properties_collector_factories[0] = collector_factory; + Reopen(options); + // Create 4 tables + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + } + db_->Flush(FlushOptions()); + } + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(4U, props.size()); + uint32_t sum = 0; + for (const auto& item : props) { + auto& user_collected = item.second->user_collected_properties; + ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") != + user_collected.end()); + ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb"); + ASSERT_TRUE(user_collected.find("Count") != user_collected.end()); + Slice key(user_collected.at("Count")); + uint32_t count; + ASSERT_TRUE(GetVarint32(&key, &count)); + sum += count; + } + ASSERT_EQ(10u + 11u + 12u + 13u, sum); + + ASSERT_GT(collector_factory->num_created_, 0U); + collector_factory->num_created_ = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_GT(collector_factory->num_created_, 0U); +} +#endif // ROCKSDB_LITE + +TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 3; + options.max_background_flushes = 0; + options.table_properties_collector_factories.resize(1); + std::shared_ptr collector_factory = + std::make_shared(1); + options.table_properties_collector_factories[0] = collector_factory, + CreateAndReopenWithCF({"pikachu"}, options); + // Create 2 files + for (int table = 0; table < 2; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(1, ToString(table * 100 + i), "val"); + } + Flush(1); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + // Trigger automatic compactions. 
+ for (int table = 0; table < 3; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(1, ToString(table * 100 + i), "val"); + } + Flush(1); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_GT(collector_factory->num_created_, 0U); + + // Come back to write to default column family + collector_factory->num_created_ = 0; + collector_factory->set_expected_column_family_id(0); // default CF + // Create 4 tables in default column family + for (int table = 0; table < 2; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + // Trigger automatic compactions. + for (int table = 0; table < 3; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_GT(collector_factory->num_created_, 0U); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) { + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = 4096; + options.max_write_buffer_number = 8; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.target_file_size_base = 2048; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.soft_pending_compaction_bytes_limit = 1024 * 1024; + options.num_levels = 8; + + std::shared_ptr collector_factory = + std::make_shared(); + options.table_properties_collector_factories.resize(1); + 
options.table_properties_collector_factories[0] = collector_factory; + + DestroyAndReopen(options); + + const int kMaxKey = 1000; + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + if (NumTableFilesAtLevel(0) == 1) { + // Clear Level 0 so that when later flush a file with deletions, + // we don't trigger an organic compaction. + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(Put(Key(kMaxKey * 2), "")); + Flush(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + { + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(kMaxKey - 100)); + while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { + iter->Next(); + ++c; + } + ASSERT_EQ(c, 200); + } + + Delete(Key(0)); + for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) { + Delete(Key(i)); + } + Delete(Key(kMaxKey * 2)); + + Flush(); + dbfull()->TEST_WaitForCompact(); + + { + SetPerfLevel(kEnableCount); + perf_context.Reset(); + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(kMaxKey - 100)); + while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { + iter->Next(); + } + ASSERT_EQ(c, 0); + ASSERT_LT(perf_context.internal_delete_skipped_count, 30u); + ASSERT_LT(perf_context.internal_key_skipped_count, 30u); + SetPerfLevel(kDisable); + } +} + +TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) { + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.max_write_buffer_number = 8; + options.level0_file_num_compaction_trigger = 10; + options.level0_slowdown_writes_trigger = 10; + options.level0_stop_writes_trigger = 10; + options.disable_auto_compactions = true; + + std::shared_ptr collector_factory = + std::make_shared(); + options.table_properties_collector_factories.resize(1); + 
options.table_properties_collector_factories[0] = collector_factory; + + DestroyAndReopen(options); + + const int kMaxKey = 100; + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), "")); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + for (int i = 1; i < kMaxKey - 1; i++) { + Delete(Key(i)); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + + // Restart the DB. Although number of files didn't reach + // options.level0_file_num_compaction_trigger, compaction should + // still be triggered because of the need-compaction hint. + options.disable_auto_compactions = false; + Reopen(options); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + { + SetPerfLevel(kEnableCount); + perf_context.Reset(); + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { + c++; + } + ASSERT_EQ(c, 2); + ASSERT_EQ(perf_context.internal_delete_skipped_count, 0); + // We iterate every key twice. Is it a bug? + ASSERT_LE(perf_context.internal_key_skipped_count, 2); + SetPerfLevel(kDisable); + } +} +#endif // ROCKSDB_LITE +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_sst_test.cc b/external/rocksdb/db/db_sst_test.cc new file mode 100644 index 0000000000..58da7a9cab --- /dev/null +++ b/external/rocksdb/db/db_sst_test.cc @@ -0,0 +1,1733 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksdb/sst_file_writer.h" +#include "util/sst_file_manager_impl.h" +#include "port/port.h" + +namespace rocksdb { + +class DBSSTTest : public DBTestBase { + public: + DBSSTTest() : DBTestBase("/db_sst_test") {} +}; + +TEST_F(DBSSTTest, DontDeletePendingOutputs) { + Options options; + options.env = env_; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Every time we write to a table file, call FOF/POF with full DB scan. This + // will make sure our pending_outputs_ protection work correctly + std::function purge_obsolete_files_function = [&]() { + JobContext job_context(0); + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&job_context, true /*force*/); + dbfull()->TEST_UnlockMutex(); + dbfull()->PurgeObsoleteFiles(job_context); + job_context.Clean(); + }; + + env_->table_write_callback_ = &purge_obsolete_files_function; + + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put("a", "begin")); + ASSERT_OK(Put("z", "end")); + ASSERT_OK(Flush()); + } + + // If pending output guard does not work correctly, PurgeObsoleteFiles() will + // delete the file that Compaction is trying to create, causing this: error + // db/db_test.cc:975: IO error: + // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory + Compact("a", "b"); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBSSTTest, DontDeleteMovedFile) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + 
DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // If the moved file is actually deleted (the move-safeguard in + // ~Version::Version() is not there), we get this failure: + // Corruption: Can't access /000009.sst + Reopen(options); +} + +// This reproduces a bug where we don't delete a file because when it was +// supposed to be deleted, it was blocked by pending_outputs +// Consider: +// 1. current file_number is 13 +// 2. compaction (1) starts, blocks deletion of all files starting with 13 +// (pending outputs) +// 3. file 13 is created by compaction (2) +// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file +// 13 has no references, it is put into VersionSet::obsolete_files_ +// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13 +// is deleted from obsolete_files_ set. +// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by +// pending outputs since compaction (1) is still running. It is not deleted and +// it is not present in obsolete_files_ anymore. Therefore, we never delete it. 
+TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 2 * 1024 * 1024; // 2 MB + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + options.max_background_flushes = 2; + options.max_background_compactions = 2; + + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + + Reopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + test::SleepingBackgroundTask blocking_thread; + port::Mutex mutex_; + bool already_blocked(false); + + // block the flush + std::function block_first_time = [&]() { + bool blocking = false; + { + MutexLock l(&mutex_); + if (!already_blocked) { + blocking = true; + already_blocked = true; + } + } + if (blocking) { + blocking_thread.DoSleep(); + } + }; + env_->table_write_callback_ = &block_first_time; + // Insert 2.5MB data, which should trigger a flush because we exceed + // write_buffer_size. 
The flush will be blocked with block_first_time + // pending_file is protecting all the files created after + for (int j = 0; j < 256; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); + } + blocking_thread.WaitUntilSleeping(); + + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto file_on_L2 = metadata[0].name; + listener->SetExpectedFileName(dbname_ + file_on_L2); + + ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, + true /* disallow trivial move */)); + ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); + + // finish the flush! + blocking_thread.WakeUp(); + blocking_thread.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0)); + + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 2U); + + // This file should have been deleted during last compaction + ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2)); + listener->VerifyMatchedCount(1); +} + +#endif // ROCKSDB_LITE + +TEST_F(DBSSTTest, DBWithSstFileManager) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { files_added++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { files_deleted++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* arg) { files_moved++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 25; i++) { + 
GenerateNewRandomFile(&rnd); + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + // Verify that we are tracking all sst files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles()); + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + auto files_in_db = GetAllSSTFiles(); + // Verify that we are tracking all sst files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + // We flushed at least 25 files + ASSERT_GE(files_added, 25); + // Compaction must have deleted some files + ASSERT_GT(files_deleted, 0); + // No files were moved + ASSERT_EQ(files_moved, 0); + + Close(); + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened + Close(); + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBSSTTest, RateLimitedDelete) { + Destroy(last_options_); + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DBSSTTest::RateLimitedDelete:1", + "DeleteScheduler::BackgroundEmptyTrash"}, + }); + + std::vector penalties; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::BackgroundEmptyTrash:Wait", + [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.env 
= env_; + + std::string trash_dir = test::TmpDir(env_) + "/trash"; + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); + ASSERT_OK(s); + auto sfm = static_cast(options.sst_file_manager.get()); + + ASSERT_OK(TryReopen(options)); + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + + // Compaction will move the 4 files in L0 to trash and create 1 L1 file + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + uint64_t delete_start_time = env_->NowMicros(); + // Hold BackgroundEmptyTrash + TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1"); + sfm->WaitForEmptyTrash(); + uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time; + + uint64_t total_files_size = 0; + uint64_t expected_penlty = 0; + ASSERT_EQ(penalties.size(), metadata.size()); + for (size_t i = 0; i < metadata.size(); i++) { + total_files_size += metadata[i].size; + expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec); + ASSERT_EQ(expected_penlty, penalties[i]); + } + ASSERT_GT(time_spent_deleting, expected_penlty * 0.9); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Create a DB with 2 db_paths, and generate multiple files in the 2 +// db_paths using CompactRangeOptions, make sure that files that were +// deleted from first db_path were deleted using DeleteScheduler and +// files in the second path were not. 
+TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) { + int bg_delete_file = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void* arg) { bg_delete_file++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.db_paths.emplace_back(dbname_, 1024 * 100); + options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100); + options.env = env_; + + std::string trash_dir = test::TmpDir(env_) + "/trash"; + int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec + Status s; + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); + ASSERT_OK(s); + auto sfm = static_cast(options.sst_file_manager.get()); + + DestroyAndReopen(options); + + // Create 4 files in L0 + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + // Compaction will delete files from L0 in first db path and generate a new + // file in L1 in second db path + CompactRangeOptions compact_options; + compact_options.target_path_id = 1; + Slice begin("Key0"); + Slice end("Key3"); + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + // Create 4 files in L0 + for (int i = 4; i < 8; i++) { + ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'))); + ASSERT_OK(Flush()); + } + ASSERT_EQ("4,1", FilesPerLevel(0)); + + // Compaction will delete files from L0 in first db path and generate a new + // file in L1 in second db path + begin = "Key4"; + end = "Key7"; + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + ASSERT_EQ("0,2", FilesPerLevel(0)); + + sfm->WaitForEmptyTrash(); + ASSERT_EQ(bg_delete_file, 8); + + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + 
ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + sfm->WaitForEmptyTrash(); + ASSERT_EQ(bg_delete_file, 8); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) { + int bg_delete_file = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void* arg) { bg_delete_file++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + + // Create 4 files in L0 + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + // Close DB and destroy it using DeleteScheduler + Close(); + std::string trash_dir = test::TmpDir(env_) + "/trash"; + int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec + Status s; + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); + ASSERT_OK(s); + ASSERT_OK(DestroyDB(dbname_, options)); + + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->WaitForEmptyTrash(); + // We have deleted the 4 sst files in the delete_scheduler + ASSERT_EQ(bg_delete_file, 4); +} +#endif // ROCKSDB_LITE + +TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing 100 keys. 
+ for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + + uint64_t first_file_size = 0; + auto files_in_db = GetAllSSTFiles(&first_file_size); + ASSERT_EQ(sfm->GetTotalSize(), first_file_size); + + // Set the maximum allowed space usage to the current total size + sfm->SetMaxAllowedSpaceUsage(first_file_size + 1); + + ASSERT_OK(Put("key1", "val1")); + // This flush will cause bg_error_ and will fail + ASSERT_NOK(Flush()); +} + +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) { + // This test will set a maximum allowed space for the DB, then it will + // keep filling the DB until the limit is reached and bg_error_ is set. + // When bg_error_ is set we will verify that the DB size is greater + // than the limit. + + std::vector max_space_limits_mbs = {1, 2, 4, 8, 10}; + + bool bg_error_set = false; + uint64_t total_sst_files_size = 0; + + int reached_max_space_on_flush = 0; + int reached_max_space_on_compaction = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", + [&](void* arg) { + bg_error_set = true; + GetAllSSTFiles(&total_sst_files_size); + reached_max_space_on_flush++; + }); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached", + [&](void* arg) { + bg_error_set = true; + GetAllSSTFiles(&total_sst_files_size); + reached_max_space_on_compaction++; + }); + + for (auto limit_mb : max_space_limits_mbs) { + bg_error_set = false; + total_sst_files_size = 0; + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.write_buffer_size = 1024 * 512; // 512 Kb + DestroyAndReopen(options); + Random rnd(301); + + 
sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024); + + int keys_written = 0; + uint64_t estimated_db_size = 0; + while (true) { + auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); + if (!s.ok()) { + break; + } + keys_written++; + // Check the estimated db size vs the db limit just to make sure we + // dont run into an infinite loop + estimated_db_size = keys_written * 60; // ~60 bytes per key + ASSERT_LT(estimated_db_size, limit_mb * 1024 * 1024 * 2); + } + ASSERT_TRUE(bg_error_set); + ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + + ASSERT_GT(reached_max_space_on_flush, 0); + ASSERT_GT(reached_max_space_on_compaction, 0); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) { + // Open DB with infinite max open files + // - First iteration use 1 thread to open files + // - Second iteration use 5 threads to open files + for (int iter = 0; iter < 2; iter++) { + Options options; + options.create_if_missing = true; + options.write_buffer_size = 100000; + options.disable_auto_compactions = true; + options.max_open_files = -1; + if (iter == 0) { + options.max_file_opening_threads = 1; + } else { + options.max_file_opening_threads = 5; + } + options = CurrentOptions(options); + DestroyAndReopen(options); + + // Create 12 Files in L0 (then move then to L2) + for (int i = 0; i < 12; i++) { + std::string k = "L2_" + Key(i); + ASSERT_OK(Put(k, k + std::string(1000, 'a'))); + ASSERT_OK(Flush()); + } + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + db_->CompactRange(compact_options, nullptr, nullptr); + + // Create 12 Files in L0 + for (int i = 0; i < 12; i++) { + std::string k = "L0_" + Key(i); + ASSERT_OK(Put(k, k + std::string(1000, 'a'))); + ASSERT_OK(Flush()); + } + Close(); + + // Reopening the DB will load all exisitng files + Reopen(options); + ASSERT_EQ("12,0,12", FilesPerLevel(0)); + 
std::vector> files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + + for (const auto& level : files) { + for (const auto& file : level) { + ASSERT_TRUE(file.table_reader_handle != nullptr); + } + } + + for (int i = 0; i < 12; i++) { + ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a')); + ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a')); + } + } +} + +TEST_F(DBSSTTest, GetTotalSstFilesSize) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + DestroyAndReopen(options); + // Generate 5 files in L0 + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 10; j++) { + std::string val = "val_file_" + ToString(i); + ASSERT_OK(Put(Key(j), val)); + } + Flush(); + } + ASSERT_EQ("5", FilesPerLevel(0)); + + std::vector live_files_meta; + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 5); + uint64_t single_file_size = live_files_meta[0].size; + + uint64_t live_sst_files_size = 0; + uint64_t total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 5 + // Total SST files = 5 + ASSERT_EQ(live_sst_files_size, 5 * single_file_size); + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + // hold current version + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + + // Compact 5 files into 1 file in L0 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 1); + + live_sst_files_size = 0; + total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + 
ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 1 (compacted file) + // Total SST files = 6 (5 original files + compacted file) + ASSERT_EQ(live_sst_files_size, 1 * single_file_size); + ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + + // hold current version + std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + + // Delete all keys and compact, this will delete all live files + for (int i = 0; i < 10; i++) { + ASSERT_OK(Delete(Key(i))); + } + Flush(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 0); + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 6 (5 original files + compacted file) + ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + + iter1.reset(); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 1 (compacted file) + ASSERT_EQ(total_sst_files_size, 1 * single_file_size); + + iter2.reset(); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 0 + ASSERT_EQ(total_sst_files_size, 0); +} + +TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + DestroyAndReopen(options); + // Generate 5 files in L0 + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(Key(i), "val")); + Flush(); + } + ASSERT_EQ("5", FilesPerLevel(0)); + + std::vector live_files_meta; + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 5); + uint64_t single_file_size = 
live_files_meta[0].size; + + uint64_t live_sst_files_size = 0; + uint64_t total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + + // Live SST files = 5 + // Total SST files = 5 + ASSERT_EQ(live_sst_files_size, 5 * single_file_size); + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + // hold current version + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + + // Compaction will do trivial move from L0 to L1 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,5", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 5); + + live_sst_files_size = 0; + total_sst_files_size = 0; + for (const auto& file_meta : live_files_meta) { + live_sst_files_size += file_meta.size; + } + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 5 + // Total SST files = 5 (used in 2 version) + ASSERT_EQ(live_sst_files_size, 5 * single_file_size); + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + // hold current version + std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + + // Delete all keys and compact, this will delete all live files + for (int i = 0; i < 5; i++) { + ASSERT_OK(Delete(Key(i))); + } + Flush(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("", FilesPerLevel(0)); + + live_files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&live_files_meta); + ASSERT_EQ(live_files_meta.size(), 0); + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 5 (used in 2 version) + ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + + iter1.reset(); + 
iter2.reset(); + + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", + &total_sst_files_size)); + // Live SST files = 0 + // Total SST files = 0 + ASSERT_EQ(total_sst_files_size, 0); +} + +TEST_F(DBSSTTest, AddExternalSstFile) { + do { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + // file1.sst (0 => 99) + std::string file1 = sst_files_folder + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + // sst_file_writer already finished, cannot add this value + s = sst_file_writer.Add(Key(100), "bad_val"); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // file2.sst (100 => 199) + std::string file2 = sst_files_folder + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 100; k < 200; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + // Cannot add this key because it's not after last added key + s = sst_file_writer.Add(Key(99), "bad_val"); + ASSERT_FALSE(s.ok()) << s.ToString(); + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 100); + ASSERT_EQ(file2_info.smallest_key, Key(100)); + ASSERT_EQ(file2_info.largest_key, Key(199)); + + // file3.sst (195 => 299) + // This file values overlap with file2 values + std::string file3 = sst_files_folder + "file3.sst"; + 
ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 195; k < 300; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 105); + ASSERT_EQ(file3_info.smallest_key, Key(195)); + ASSERT_EQ(file3_info.largest_key, Key(299)); + + // file4.sst (30 => 39) + // This file values overlap with file1 values + std::string file4 = sst_files_folder + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + for (int k = 30; k < 40; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file4_info; + s = sst_file_writer.Finish(&file4_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file4_info.file_path, file4); + ASSERT_EQ(file4_info.num_entries, 10); + ASSERT_EQ(file4_info.smallest_key, Key(30)); + ASSERT_EQ(file4_info.largest_key, Key(39)); + + // file5.sst (400 => 499) + std::string file5 = sst_files_folder + "file5.sst"; + ASSERT_OK(sst_file_writer.Open(file5)); + for (int k = 400; k < 500; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file5_info; + s = sst_file_writer.Finish(&file5_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file5_info.file_path, file5); + ASSERT_EQ(file5_info.num_entries, 100); + ASSERT_EQ(file5_info.smallest_key, Key(400)); + ASSERT_EQ(file5_info.largest_key, Key(499)); + + // Cannot create an empty sst file + std::string file_empty = sst_files_folder + "file_empty.sst"; + ExternalSstFileInfo file_empty_info; + s = sst_file_writer.Finish(&file_empty_info); + ASSERT_NOK(s); + + DestroyAndReopen(options); + // Add file using file path + s = db_->AddFile(std::vector(1, file1)); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 100; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + 
"_val"); + } + + // Add file while holding a snapshot will fail + const Snapshot* s1 = db_->GetSnapshot(); + if (s1 != nullptr) { + ASSERT_NOK(db_->AddFile(std::vector(1, file2_info))); + db_->ReleaseSnapshot(s1); + } + // We can add the file after releaseing the snapshot + ASSERT_OK(db_->AddFile(std::vector(1, file2_info))); + + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 200; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + // This file has overlapping values with the exisitng data + s = db_->AddFile(std::vector(1, file3)); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // This file has overlapping values with the exisitng data + s = db_->AddFile(std::vector(1, file4_info)); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // Overwrite values of keys divisible by 5 + for (int k = 0; k < 200; k += 5) { + ASSERT_OK(Put(Key(k), Key(k) + "_val_new")); + } + ASSERT_NE(db_->GetLatestSequenceNumber(), 0U); + + // Key range of file5 (400 => 499) dont overlap with any keys in DB + ASSERT_OK(db_->AddFile(std::vector(1, file5))); + + // Make sure values are correct before and after flush/compaction + for (int i = 0; i < 2; i++) { + for (int k = 0; k < 200; k++) { + std::string value = Key(k) + "_val"; + if (k % 5 == 0) { + value += "_new"; + } + ASSERT_EQ(Get(Key(k)), value); + } + for (int k = 400; k < 500; k++) { + std::string value = Key(k) + "_val"; + ASSERT_EQ(Get(Key(k)), value); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + + Close(); + options.disable_auto_compactions = true; + Reopen(options); + + // Delete keys in range (400 => 499) + for (int k = 400; k < 500; k++) { + ASSERT_OK(Delete(Key(k))); + } + // We deleted range (400 => 499) but cannot add file5 because + // of the range tombstones + ASSERT_NOK(db_->AddFile(std::vector(1, file5))); + + // Compacting the DB will remove the tombstones + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Now we 
can add the file + ASSERT_OK(db_->AddFile(std::vector(1, file5))); + + // Verify values of file5 in DB + for (int k = 400; k < 500; k++) { + std::string value = Key(k) + "_val"; + ASSERT_EQ(Get(Key(k)), value); + } + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + kSkipFIFOCompaction)); +} + +TEST_F(DBSSTTest, AddExternalSstFileList) { + do { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + // file1.sst (0 => 99) + std::string file1 = sst_files_folder + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + // sst_file_writer already finished, cannot add this value + s = sst_file_writer.Add(Key(100), "bad_val"); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // file2.sst (100 => 199) + std::string file2 = sst_files_folder + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 100; k < 200; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + // Cannot add this key because it's not after last added key + s = sst_file_writer.Add(Key(99), "bad_val"); + ASSERT_FALSE(s.ok()) << s.ToString(); + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 100); + ASSERT_EQ(file2_info.smallest_key, Key(100)); + ASSERT_EQ(file2_info.largest_key, Key(199)); + + // file3.sst (195 => 199) + // This file 
values overlap with file2 values + std::string file3 = sst_files_folder + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 195; k < 200; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 5); + ASSERT_EQ(file3_info.smallest_key, Key(195)); + ASSERT_EQ(file3_info.largest_key, Key(199)); + + // file4.sst (30 => 39) + // This file values overlap with file1 values + std::string file4 = sst_files_folder + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + for (int k = 30; k < 40; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file4_info; + s = sst_file_writer.Finish(&file4_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file4_info.file_path, file4); + ASSERT_EQ(file4_info.num_entries, 10); + ASSERT_EQ(file4_info.smallest_key, Key(30)); + ASSERT_EQ(file4_info.largest_key, Key(39)); + + // file5.sst (200 => 299) + std::string file5 = sst_files_folder + "file5.sst"; + ASSERT_OK(sst_file_writer.Open(file5)); + for (int k = 200; k < 300; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file5_info; + s = sst_file_writer.Finish(&file5_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file5_info.file_path, file5); + ASSERT_EQ(file5_info.num_entries, 100); + ASSERT_EQ(file5_info.smallest_key, Key(200)); + ASSERT_EQ(file5_info.largest_key, Key(299)); + + // list 1 has internal key range conflict + std::vector file_list0({file1, file2}); + std::vector file_list1({file3, file2, file1}); + std::vector file_list2({file5}); + std::vector file_list3({file3, file4}); + + std::vector info_list0({file1_info, file2_info}); + std::vector info_list1( + {file3_info, file2_info, file1_info}); + std::vector info_list2({file5_info}); + 
std::vector info_list3({file3_info, file4_info}); + + DestroyAndReopen(options); + + // This list of files have key ranges are overlapping with each other + s = db_->AddFile(file_list1); + ASSERT_FALSE(s.ok()) << s.ToString(); + s = db_->AddFile(info_list1); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // Add files using file path list + s = db_->AddFile(file_list0); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 200; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + // Add file while holding a snapshot will fail + const Snapshot* s1 = db_->GetSnapshot(); + if (s1 != nullptr) { + ASSERT_NOK(db_->AddFile(info_list2)); + db_->ReleaseSnapshot(s1); + } + // We can add the file after releaseing the snapshot + ASSERT_OK(db_->AddFile(info_list2)); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 300; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + // This file list has overlapping values with the exisitng data + s = db_->AddFile(file_list3); + ASSERT_FALSE(s.ok()) << s.ToString(); + s = db_->AddFile(info_list3); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // Overwrite values of keys divisible by 5 + for (int k = 0; k < 200; k += 5) { + ASSERT_OK(Put(Key(k), Key(k) + "_val_new")); + } + ASSERT_NE(db_->GetLatestSequenceNumber(), 0U); + + // Make sure values are correct before and after flush/compaction + for (int i = 0; i < 2; i++) { + for (int k = 0; k < 200; k++) { + std::string value = Key(k) + "_val"; + if (k % 5 == 0) { + value += "_new"; + } + ASSERT_EQ(Get(Key(k)), value); + } + for (int k = 200; k < 300; k++) { + std::string value = Key(k) + "_val"; + ASSERT_EQ(Get(Key(k)), value); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + + // Delete keys in range (200 => 299) + for (int k = 200; k < 300; k++) { + ASSERT_OK(Delete(Key(k))); + } + // We deleted range (200 => 299) but cannot add file5 because + // of the 
range tombstones + ASSERT_NOK(db_->AddFile(file_list2)); + + // Compacting the DB will remove the tombstones + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Now we can add the file + ASSERT_OK(db_->AddFile(file_list2)); + + // Verify values of file5 in DB + for (int k = 200; k < 300; k++) { + std::string value = Key(k) + "_val"; + ASSERT_EQ(Get(Key(k)), value); + } + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + kSkipFIFOCompaction)); +} + +TEST_F(DBSSTTest, AddExternalSstFileListAtomicity) { + do { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + // files[0].sst (0 => 99) + // files[1].sst (100 => 199) + // ... + // file[8].sst (800 => 899) + int n = 9; + std::vector files(n); + std::vector files_info(n); + for (int i = 0; i < n; i++) { + files[i] = sst_files_folder + "file" + std::to_string(i) + ".sst"; + ASSERT_OK(sst_file_writer.Open(files[i])); + for (int k = i * 100; k < (i + 1) * 100; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + Status s = sst_file_writer.Finish(&files_info[i]); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(files_info[i].file_path, files[i]); + ASSERT_EQ(files_info[i].num_entries, 100); + ASSERT_EQ(files_info[i].smallest_key, Key(i * 100)); + ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1)); + } + files.push_back(sst_files_folder + "file" + std::to_string(n) + ".sst"); + auto s = db_->AddFile(files); + ASSERT_NOK(s) << s.ToString(); + for (int k = 0; k < n * 100; k++) { + ASSERT_EQ("NOT_FOUND", Get(Key(k))); + } + s = db_->AddFile(files_info); + ASSERT_OK(s); + for (int k = 0; k < n * 100; k++) { + std::string value = Key(k) + "_val"; + ASSERT_EQ(Get(Key(k)), value); + } + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + 
kSkipFIFOCompaction)); +} +// This test reporduce a bug that can happen in some cases if the DB started +// purging obsolete files when we are adding an external sst file. +// This situation may result in deleting the file while it's being added. +TEST_F(DBSSTTest, AddExternalSstFilePurgeObsoleteFilesBug) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + // file1.sst (0 => 500) + std::string sst_file_path = sst_files_folder + "file1.sst"; + Status s = sst_file_writer.Open(sst_file_path); + ASSERT_OK(s); + for (int i = 0; i < 500; i++) { + std::string k = Key(i); + s = sst_file_writer.Add(k, k + "_val"); + ASSERT_OK(s); + } + + ExternalSstFileInfo sst_file_info; + s = sst_file_writer.Finish(&sst_file_info); + ASSERT_OK(s); + + options.delete_obsolete_files_period_micros = 0; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::AddFile:FileCopied", [&](void* arg) { + ASSERT_OK(Put("aaa", "bbb")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("aaa", "xxx")); + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + s = db_->AddFile(std::vector(1, sst_file_path)); + ASSERT_OK(s); + + for (int i = 0; i < 500; i++) { + std::string k = Key(i); + std::string v = k + "_val"; + ASSERT_EQ(Get(k), v); + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, AddExternalSstFileNoCopy) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + const ImmutableCFOptions ioptions(options); + + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + // file1.sst (0 
=> 99) + std::string file1 = sst_files_folder + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + + // file2.sst (100 => 299) + std::string file2 = sst_files_folder + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 100; k < 300; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 200); + ASSERT_EQ(file2_info.smallest_key, Key(100)); + ASSERT_EQ(file2_info.largest_key, Key(299)); + + // file3.sst (110 => 124) .. 
overlap with file2.sst + std::string file3 = sst_files_folder + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 110; k < 125; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 15); + ASSERT_EQ(file3_info.smallest_key, Key(110)); + ASSERT_EQ(file3_info.largest_key, Key(124)); + s = db_->AddFile(std::vector(1, file1_info), + true /* move file */); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); + + s = db_->AddFile(std::vector(1, file2_info), + false /* copy file */); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(env_->FileExists(file2)); + + // This file have overlapping values with the exisitng data + s = db_->AddFile(std::vector(1, file2_info), + true /* move file */); + ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_OK(env_->FileExists(file3)); + + for (int k = 0; k < 300; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } +} + +TEST_F(DBSSTTest, AddExternalSstFileMultiThreaded) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + // Bulk load 10 files every file contain 1000 keys + int num_files = 10; + int keys_per_file = 1000; + + // Generate file names + std::vector file_names; + for (int i = 0; i < num_files; i++) { + std::string file_name = "file_" + ToString(i) + ".sst"; + file_names.push_back(sst_files_folder + file_name); + } + + do { + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + + std::atomic thread_num(0); + std::function write_file_func = [&]() { + int file_idx = thread_num.fetch_add(1); + int range_start = file_idx * keys_per_file; + int range_end = range_start + keys_per_file; + + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + 
ASSERT_OK(sst_file_writer.Open(file_names[file_idx])); + + for (int k = range_start; k < range_end; k++) { + ASSERT_OK(sst_file_writer.Add(Key(k), Key(k))); + } + + Status s = sst_file_writer.Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + }; + // Write num_files files in parallel + std::vector sst_writer_threads; + for (int i = 0; i < num_files; ++i) { + sst_writer_threads.emplace_back(write_file_func); + } + + for (auto& t : sst_writer_threads) { + t.join(); + } + + fprintf(stderr, "Wrote %d files (%d keys)\n", num_files, + num_files * keys_per_file); + + thread_num.store(0); + std::atomic files_added(0); + std::function load_file_func = [&]() { + // We intentionally add every file twice, and assert that it was added + // only once and the other add failed + int thread_id = thread_num.fetch_add(1); + int file_idx = thread_id / 2; + // sometimes we use copy, sometimes link .. the result should be the same + bool move_file = (thread_id % 3 == 0); + + Status s = db_->AddFile(std::vector(1, file_names[file_idx]), + move_file); + if (s.ok()) { + files_added++; + } + }; + // Bulk load num_files files in parallel + std::vector add_file_threads; + DestroyAndReopen(options); + for (int i = 0; i < num_files * 2; ++i) { + add_file_threads.emplace_back(load_file_func); + } + + for (auto& t : add_file_threads) { + t.join(); + } + ASSERT_EQ(files_added.load(), num_files); + fprintf(stderr, "Loaded %d files (%d keys)\n", num_files, + num_files * keys_per_file); + + // Overwrite values of keys divisible by 100 + for (int k = 0; k < num_files * keys_per_file; k += 100) { + std::string key = Key(k); + Status s = Put(key, key + "_new"); + ASSERT_TRUE(s.ok()); + } + + for (int i = 0; i < 2; i++) { + // Make sure the values are correct before and after flush/compaction + for (int k = 0; k < num_files * keys_per_file; ++k) { + std::string key = Key(k); + std::string value = (k % 100 == 0) ? 
(key + "_new") : key; + ASSERT_EQ(Get(key), value); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + + fprintf(stderr, "Verified %d values\n", num_files * keys_per_file); + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + kSkipFIFOCompaction)); +} + +TEST_F(DBSSTTest, AddExternalSstFileOverlappingRanges) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + Random rnd(301); + do { + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + DestroyAndReopen(options); + + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + printf("Option config = %d\n", option_config_); + std::vector> key_ranges; + for (int i = 0; i < 500; i++) { + int range_start = rnd.Uniform(20000); + int keys_per_range = 10 + rnd.Uniform(41); + + key_ranges.emplace_back(range_start, range_start + keys_per_range); + } + + int memtable_add = 0; + int success_add_file = 0; + int failed_add_file = 0; + std::map true_data; + for (size_t i = 0; i < key_ranges.size(); i++) { + int range_start = key_ranges[i].first; + int range_end = key_ranges[i].second; + + Status s; + std::string range_val = "range_" + ToString(i); + + // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile + if (i && i % 5 == 0) { + // Use DB::Put to insert range (insert into memtable) + range_val += "_put"; + for (int k = range_start; k <= range_end; k++) { + s = Put(Key(k), range_val); + ASSERT_OK(s); + } + memtable_add++; + } else { + // Use DB::AddFile to insert range + range_val += "_add_file"; + + // Generate the file containing the range + std::string file_name = sst_files_folder + env_->GenerateUniqueId(); + ASSERT_OK(sst_file_writer.Open(file_name)); + for (int k = range_start; k <= range_end; k++) { + s = sst_file_writer.Add(Key(k), range_val); + ASSERT_OK(s); + } + ExternalSstFileInfo file_info; + s = sst_file_writer.Finish(&file_info); + ASSERT_OK(s); + + // Insert the 
generated file + s = db_->AddFile(std::vector(1, file_info)); + + auto it = true_data.lower_bound(Key(range_start)); + if (it != true_data.end() && it->first <= Key(range_end)) { + // This range overlap with data already exist in DB + ASSERT_NOK(s); + failed_add_file++; + } else { + ASSERT_OK(s); + success_add_file++; + } + } + + if (s.ok()) { + // Update true_data map to include the new inserted data + for (int k = range_start; k <= range_end; k++) { + true_data[Key(k)] = range_val; + } + } + + // Flush / Compact the DB + if (i && i % 50 == 0) { + Flush(); + } + if (i && i % 75 == 0) { + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + } + + printf( + "Total: %" ROCKSDB_PRIszt " ranges\n" + "AddFile()|Success: %d ranges\n" + "AddFile()|RangeConflict: %d ranges\n" + "Put(): %d ranges\n", + key_ranges.size(), success_add_file, failed_add_file, memtable_add); + + // Verify the correctness of the data + for (const auto& kv : true_data) { + ASSERT_EQ(Get(kv.first), kv.second); + } + printf("keys/values verified\n"); + } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | + kSkipFIFOCompaction)); +} + +TEST_F(DBSSTTest, AddExternalSstFilePickedLevel) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.disable_auto_compactions = false; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.env = env_; + DestroyAndReopen(options); + + std::vector> file_to_keys; + + // File 0 will go to last level (L3) + file_to_keys.push_back({1, 10}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + EXPECT_EQ(FilesPerLevel(), "0,0,0,1"); + + // File 1 will go to level L2 (since it overlap with file 0 in L3) + file_to_keys.push_back({2, 9}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + EXPECT_EQ(FilesPerLevel(), 
"0,0,1,1"); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DBSSTTest::AddExternalSstFilePickedLevel:0", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Start", + "DBSSTTest::AddExternalSstFilePickedLevel:1"}, + {"DBSSTTest::AddExternalSstFilePickedLevel:2", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Flush 4 files containing the same keys + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put(Key(3), Key(3) + "put")); + ASSERT_OK(Put(Key(8), Key(8) + "put")); + ASSERT_OK(Flush()); + } + + // Wait for BackgroundCompaction() to be called + TEST_SYNC_POINT("DBSSTTest::AddExternalSstFilePickedLevel:0"); + TEST_SYNC_POINT("DBSSTTest::AddExternalSstFilePickedLevel:1"); + + EXPECT_EQ(FilesPerLevel(), "4,0,1,1"); + + // This file overlaps with file 0 (L3), file 1 (L2) and the + // output of compaction going to L1 + file_to_keys.push_back({4, 7}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + EXPECT_EQ(FilesPerLevel(), "5,0,1,1"); + + // This file does not overlap with any file or with the running compaction + file_to_keys.push_back({9000, 9001}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + EXPECT_EQ(FilesPerLevel(), "5,0,1,2"); + + // Hold compaction from finishing + TEST_SYNC_POINT("DBSSTTest::AddExternalSstFilePickedLevel:2"); + + dbfull()->TEST_WaitForCompact(); + EXPECT_EQ(FilesPerLevel(), "1,1,1,2"); + + for (size_t file_id = 0; file_id < file_to_keys.size(); file_id++) { + for (auto& key_id : file_to_keys[file_id]) { + std::string k = Key(key_id); + std::string v = k + ToString(file_id); + if (key_id == 3 || key_id == 8) { + v = k + "put"; + } + + ASSERT_EQ(Get(k), v); + } + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBSSTTest, AddExternalSstFilePickedLevelDynamic) { + std::string sst_files_folder = test::TmpDir(env_) + 
"/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.disable_auto_compactions = false; + options.level0_file_num_compaction_trigger = 4; + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = 4; + options.env = env_; + DestroyAndReopen(options); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DBSSTTest::AddExternalSstFilePickedLevelDynamic:0", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Start", + "DBSSTTest::AddExternalSstFilePickedLevelDynamic:1"}, + {"DBSSTTest::AddExternalSstFilePickedLevelDynamic:2", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Flush 4 files containing the same keys + for (int i = 0; i < 4; i++) { + for (int k = 20; k <= 30; k++) { + ASSERT_OK(Put(Key(k), Key(k) + "put")); + } + for (int k = 50; k <= 60; k++) { + ASSERT_OK(Put(Key(k), Key(k) + "put")); + } + ASSERT_OK(Flush()); + } + + // Wait for BackgroundCompaction() to be called + TEST_SYNC_POINT("DBSSTTest::AddExternalSstFilePickedLevelDynamic:0"); + TEST_SYNC_POINT("DBSSTTest::AddExternalSstFilePickedLevelDynamic:1"); + std::vector> file_to_keys; + + // This file overlaps with the output of the compaction (going to L3) + // so the file will be added to L0 since L3 is the base level + file_to_keys.push_back({31, 32, 33, 34}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + EXPECT_EQ(FilesPerLevel(), "5"); + + // This file does not overlap with the current running compactiong + file_to_keys.push_back({9000, 9001}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + EXPECT_EQ(FilesPerLevel(), "5,0,0,1"); + + // Hold compaction from finishing + TEST_SYNC_POINT("DBSSTTest::AddExternalSstFilePickedLevelDynamic:2"); + + // Output of the compaction will go to L3 + dbfull()->TEST_WaitForCompact(); + 
EXPECT_EQ(FilesPerLevel(), "1,0,0,2"); + + Close(); + options.disable_auto_compactions = true; + Reopen(options); + + file_to_keys.push_back({1, 15, 19}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + ASSERT_EQ(FilesPerLevel(), "1,0,0,3"); + + file_to_keys.push_back({1000, 1001, 1002}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + ASSERT_EQ(FilesPerLevel(), "1,0,0,4"); + + file_to_keys.push_back({500, 600, 700}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + ASSERT_EQ(FilesPerLevel(), "1,0,0,5"); + + // File 5 overlaps with file 2 (L3 / base level) + file_to_keys.push_back({2, 10}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + ASSERT_EQ(FilesPerLevel(), "2,0,0,5"); + + // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0) + file_to_keys.push_back({3, 9}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + ASSERT_EQ(FilesPerLevel(), "3,0,0,5"); + + // Verify data in files + for (size_t file_id = 0; file_id < file_to_keys.size(); file_id++) { + for (auto& key_id : file_to_keys[file_id]) { + std::string k = Key(key_id); + std::string v = k + ToString(file_id); + + ASSERT_EQ(Get(k), v); + } + } + + // Write range [5 => 10] to L0 + for (int i = 5; i <= 10; i++) { + std::string k = Key(i); + std::string v = k + "put"; + ASSERT_OK(Put(k, v)); + } + ASSERT_OK(Flush()); + ASSERT_EQ(FilesPerLevel(), "4,0,0,5"); + + // File 7 overlaps with file 4 (L3) + file_to_keys.push_back({650, 651, 652}); + ASSERT_OK(GenerateAndAddExternalFile(options, file_to_keys.back(), + file_to_keys.size() - 1)); + ASSERT_EQ(FilesPerLevel(), "5,0,0,5"); + + for (size_t file_id = 0; file_id < file_to_keys.size(); file_id++) { + for (auto& key_id : file_to_keys[file_id]) { + std::string k = Key(key_id); + std::string v = k + 
ToString(file_id); + if (key_id >= 5 && key_id <= 10) { + v = k + "put"; + } + + ASSERT_EQ(Get(k), v); + } + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif // ROCKSDB_LITE + +// 1 Create some SST files by inserting K-V pairs into DB +// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file +// 3 Open DB and check if all key can be read +TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) { + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.num_levels = 4; + DestroyAndReopen(options); + + Random rnd(301); + int key_id = 0; + for (int i = 0; i < 10; ++i) { + GenerateNewFile(&rnd, &key_id, false); + } + Flush(); + Close(); + int const num_files = GetSstFileCount(dbname_); + ASSERT_GT(num_files, 0); + + std::vector filenames; + GetSstFiles(dbname_, &filenames); + int num_ldb_files = 0; + for (size_t i = 0; i < filenames.size(); ++i) { + if (i & 1) { + continue; + } + std::string const rdb_name = dbname_ + "/" + filenames[i]; + std::string const ldb_name = Rocks2LevelTableFileName(rdb_name); + ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok()); + ++num_ldb_files; + } + ASSERT_GT(num_ldb_files, 0); + ASSERT_EQ(num_files, GetSstFileCount(dbname_)); + + Reopen(options); + for (int k = 0; k < key_id; ++k) { + ASSERT_NE("NOT_FOUND", Get(Key(k))); + } + Destroy(options); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_table_properties_test.cc b/external/rocksdb/db/db_table_properties_test.cc new file mode 100644 index 0000000000..de2ace0d2f --- /dev/null +++ b/external/rocksdb/db/db_table_properties_test.cc @@ -0,0 +1,260 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "util/testharness.h" +#include "util/testutil.h" + +#ifndef ROCKSDB_LITE + +namespace rocksdb { + +// A helper function that ensures the table properties returned in +// `GetPropertiesOfAllTablesTest` is correct. +// This test assumes entries size is different for each of the tables. +namespace { + +void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { + TablePropertiesCollection props; + ASSERT_OK(db->GetPropertiesOfAllTables(&props)); + + ASSERT_EQ(4U, props.size()); + std::unordered_set unique_entries; + + // Indirect test + uint64_t sum = 0; + for (const auto& item : props) { + unique_entries.insert(item.second->num_entries); + sum += item.second->num_entries; + } + + ASSERT_EQ(props.size(), unique_entries.size()); + ASSERT_EQ(expected_entries_size, sum); +} +} // namespace + +class DBTablePropertiesTest : public DBTestBase { + public: + DBTablePropertiesTest() : DBTestBase("/db_table_properties_test") {} + TablePropertiesCollection TestGetPropertiesOfTablesInRange( + std::vector ranges, std::size_t* num_properties = nullptr, + std::size_t* num_files = nullptr); +}; + +TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 8; + Reopen(options); + // Create 4 tables + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + db_->Put(WriteOptions(), ToString(table * 
100 + i), "val"); + } + db_->Flush(FlushOptions()); + } + + // 1. Read table properties directly from file + Reopen(options); + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 2. Put two tables to table cache and + Reopen(options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. + for (int i = 0; i < 2; ++i) { + Get(ToString(i * 100 + 0)); + } + + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 3. Put all tables to table cache + Reopen(options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. + for (int i = 0; i < 4; ++i) { + Get(ToString(i * 100 + 0)); + } + VerifyTableProperties(db_, 10 + 11 + 12 + 13); +} + +TablePropertiesCollection +DBTablePropertiesTest::TestGetPropertiesOfTablesInRange( + std::vector ranges, std::size_t* num_properties, + std::size_t* num_files) { + + // Since we deref zero element in the vector it can not be empty + // otherwise we pass an address to some random memory + EXPECT_GT(ranges.size(), 0U); + // run the query + TablePropertiesCollection props; + EXPECT_OK(db_->GetPropertiesOfTablesInRange( + db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props)); + + // Make sure that we've received properties for those and for those files + // only which fall within requested ranges + std::vector vmd; + db_->GetLiveFilesMetaData(&vmd); + for (auto& md : vmd) { + std::string fn = md.db_path + md.name; + bool in_range = false; + for (auto& r : ranges) { + // smallestkey < limit && largestkey >= start + if (r.limit.compare(md.smallestkey) >= 0 && + r.start.compare(md.largestkey) <= 0) { + in_range = true; + EXPECT_GT(props.count(fn), 0); + } + } + if (!in_range) { + EXPECT_EQ(props.count(fn), 0); + } + } + + if (num_properties) { + *num_properties = props.size(); + } + + if (num_files) { + *num_files = vmd.size(); + } + return props; +} + +TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { + // Fixed random 
sead + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = 4096; + options.max_write_buffer_number = 3; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.target_file_size_base = 2048; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.hard_pending_compaction_bytes_limit = 16 * 1024; + options.num_levels = 8; + + DestroyAndReopen(options); + + // build a decent LSM + for (int i = 0; i < 10000; i++) { + ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + if (NumTableFilesAtLevel(0) == 0) { + ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102))); + Flush(); + } + + db_->PauseBackgroundWork(); + + // Ensure that we have at least L0, L1 and L2 + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ASSERT_GT(NumTableFilesAtLevel(2), 0); + + // Query the largest range + std::size_t num_properties, num_files; + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST), + test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))}, + &num_properties, &num_files); + ASSERT_EQ(num_properties, num_files); + + // Query the empty range + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST), + test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))}, + &num_properties, &num_files); + ASSERT_GT(num_files, 0); + ASSERT_EQ(num_properties, 0); + + // Query the middle rangee + TestGetPropertiesOfTablesInRange( + {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE), + test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))}, + &num_properties, &num_files); + ASSERT_GT(num_files, 0); + ASSERT_GT(num_files, num_properties); + ASSERT_GT(num_properties, 0); + + // Query a bunch of random ranges + 
for (int j = 0; j < 100; j++) { + // create a bunch of ranges + std::vector random_keys; + // Random returns numbers with zero included + // when we pass empty ranges TestGetPropertiesOfTablesInRange() + // derefs random memory in the empty ranges[0] + // so want to be greater than zero and even since + // the below loop requires that random_keys.size() to be even. + auto n = 2 * (rnd.Uniform(50) + 1); + + for (uint32_t i = 0; i < n; ++i) { + random_keys.push_back(test::RandomKey(&rnd, 5)); + } + + ASSERT_GT(random_keys.size(), 0U); + ASSERT_EQ((random_keys.size() % 2), 0U); + + std::vector ranges; + auto it = random_keys.begin(); + while (it != random_keys.end()) { + ranges.push_back(Range(*it, *(it + 1))); + it += 2; + } + + TestGetPropertiesOfTablesInRange(std::move(ranges)); + } +} + +TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) { + std::string kExtraCfName = "pikachu"; + CreateAndReopenWithCF({kExtraCfName}, CurrentOptions()); + + // Create one table per CF, then verify it was created with the column family + // name property. 
+ for (int cf = 0; cf < 2; ++cf) { + Put(cf, "key", "val"); + Flush(cf); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + std::string expected_cf_name; + if (cf > 0) { + expected_cf_name = kExtraCfName; + } else { + expected_cf_name = kDefaultColumnFamilyName; + } + ASSERT_EQ(expected_cf_name, + fname_to_props.begin()->second->column_family_name); + ASSERT_EQ(cf, static_cast( + fname_to_props.begin()->second->column_family_id)); + } +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/db_tailing_iter_test.cc b/external/rocksdb/db/db_tailing_iter_test.cc index 4ca5e90181..bfb62926e1 100644 --- a/external/rocksdb/db/db_tailing_iter_test.cc +++ b/external/rocksdb/db/db_tailing_iter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,11 +10,11 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. 
// which is a pity, it is a good test -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) +#include "db/db_test_util.h" #include "db/forward_iterator.h" #include "port/stack_trace.h" -#include "util/db_test_util.h" namespace rocksdb { @@ -140,6 +140,8 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { std::unique_ptr iterh(db_->NewIterator(read_options, handles_[1])); std::string value(1024, 'a'); bool file_iters_deleted = false; + bool file_iters_renewed_null = false; + bool file_iters_renewed_copy = false; rocksdb::SyncPoint::GetInstance()->SetCallBack( "ForwardIterator::SeekInternal:Return", [&](void* arg) { ForwardIterator* fiter = reinterpret_cast(arg); @@ -152,6 +154,12 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { ASSERT_TRUE(!file_iters_deleted || fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters)); }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::RenewIterators:Null", + [&](void* arg) { file_iters_renewed_null = true; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::RenewIterators:Copy", + [&](void* arg) { file_iters_renewed_copy = true; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); const int num_records = 1000; for (int i = 1; i < num_records; ++i) { @@ -160,7 +168,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { char buf3[32]; char buf4[32]; snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); - snprintf(buf3, sizeof(buf1), "00b0%016d", i * 5); + snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5); Slice key(buf1, 20); ASSERT_OK(Put(1, key, value)); @@ -173,7 +181,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { if (i == 299) { file_iters_deleted = true; } - snprintf(buf4, sizeof(buf1), "00a0%016d", i * 5 / 2); + snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2); Slice target(buf4, 20); iterh->Seek(target); ASSERT_TRUE(iter->Valid()); @@ -203,6 +211,8 @@ TEST_F(DBTestTailingIterator, 
TailingIteratorTrimSeekToNext) { ASSERT_LE(num_iters, 1); file_iters_deleted = false; } + ASSERT_TRUE(file_iters_renewed_null); + ASSERT_TRUE(file_iters_renewed_copy); iter = 0; itern = 0; iterh = 0; @@ -644,12 +654,57 @@ TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToSame) { ASSERT_EQ(found, iter->key().ToString()); } +TEST_F(DBTestTailingIterator, ForwardIteratorVersionProperty) { + Options options = CurrentOptions(); + options.write_buffer_size = 1000; + + ReadOptions read_options; + read_options.tailing = true; + + Put("foo", "bar"); + + uint64_t v1, v2, v3, v4; + { + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->Seek("foo"); + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v1 = static_cast(std::atoi(prop_value.c_str())); + + Put("foo1", "bar1"); + Flush(); + + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v2 = static_cast(std::atoi(prop_value.c_str())); + + iter->Seek("f"); + + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v3 = static_cast(std::atoi(prop_value.c_str())); + + ASSERT_EQ(v1, v2); + ASSERT_GT(v3, v2); + } + + { + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->Seek("foo"); + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v4 = static_cast(std::atoi(prop_value.c_str())); + } + ASSERT_EQ(v3, v4); +} } // namespace rocksdb -#endif // !(defined NDEBUG) || !defined(OS_WIN) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/external/rocksdb/db/db_test.cc b/external/rocksdb/db/db_test.cc index 4bfe4dbd24..5bcc6a2136 100644 --- a/external/rocksdb/db/db_test.cc +++ b/external/rocksdb/db/db_test.cc 
@@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,31 +10,32 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. // which is a pity, it is a good test -#if !(defined NDEBUG) || !defined(OS_WIN) - +#include #include -#include #include #include #include #include -#include #ifndef OS_WIN #include #endif +#ifdef OS_SOLARIS +#include +#endif -#include "db/filename.h" -#include "db/dbformat.h" #include "db/db_impl.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/filename.h" #include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "memtable/hash_linklist_rep.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" -#include "rocksdb/delete_scheduler.h" #include "rocksdb/env.h" #include "rocksdb/experimental.h" #include "rocksdb/filter_policy.h" @@ -43,217 +44,178 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/snapshot.h" -#include "rocksdb/sst_file_writer.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/thread_status.h" -#include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" #include "table/block_based_table_factory.h" #include "table/mock_table.h" #include "table/plain_table_factory.h" -#include "util/db_test_util.h" +#include "table/scoped_arena_iterator.h" +#include "util/compression.h" #include 
"util/file_reader_writer.h" #include "util/hash.h" -#include "util/hash_linklist_rep.h" -#include "utilities/merge_operators.h" #include "util/logging.h" -#include "util/compression.h" +#include "util/mock_env.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" -#include "util/statistics.h" -#include "util/testharness.h" -#include "util/scoped_arena_iterator.h" +#include "util/string_util.h" #include "util/sync_point.h" +#include "util/testharness.h" #include "util/testutil.h" -#include "util/mock_env.h" -#include "util/string_util.h" #include "util/thread_status_util.h" #include "util/xfunc.h" +#include "utilities/merge_operators.h" namespace rocksdb { -static long TestGetTickerCount(const Options& options, Tickers ticker_type) { - return options.statistics->getTickerCount(ticker_type); -} - -// A helper function that ensures the table properties returned in -// `GetPropertiesOfAllTablesTest` is correct. -// This test assumes entries size is different for each of the tables. -namespace { - -void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { - TablePropertiesCollection props; - ASSERT_OK(db->GetPropertiesOfAllTables(&props)); - - ASSERT_EQ(4U, props.size()); - std::unordered_set unique_entries; - - // Indirect test - uint64_t sum = 0; - for (const auto& item : props) { - unique_entries.insert(item.second->num_entries); - sum += item.second->num_entries; - } - - ASSERT_EQ(props.size(), unique_entries.size()); - ASSERT_EQ(expected_entries_size, sum); -} - -uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, - std::string column_family_name) { - std::vector metadata; - db->GetLiveFilesMetaData(&metadata); - uint64_t result = 0; - for (auto& fileMetadata : metadata) { - result += (fileMetadata.column_family_name == column_family_name); - } - return result; -} - -} // namespace - class DBTest : public DBTestBase { public: DBTest() : DBTestBase("/db_test") {} }; -class DBTestWithParam : public DBTest, - public testing::WithParamInterface { 
+class DBTestWithParam + : public DBTest, + public testing::WithParamInterface> { public: - DBTestWithParam() { max_subcompactions_ = GetParam(); } + DBTestWithParam() { + max_subcompactions_ = std::get<0>(GetParam()); + exclusive_manual_compaction_ = std::get<1>(GetParam()); + } // Required if inheriting from testing::WithParamInterface<> static void SetUpTestCase() {} static void TearDownTestCase() {} uint32_t max_subcompactions_; + bool exclusive_manual_compaction_; }; -class BloomStatsTestWithParam - : public DBTest, - public testing::WithParamInterface> { - public: - BloomStatsTestWithParam() { - use_block_table_ = std::get<0>(GetParam()); - use_block_based_builder_ = std::get<1>(GetParam()); - - options_.create_if_missing = true; - options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4)); - options_.memtable_prefix_bloom_bits = 8 * 1024; - if (use_block_table_) { - BlockBasedTableOptions table_options; - table_options.hash_index_allow_collision = false; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder_)); - options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); - } else { - PlainTableOptions table_options; - options_.table_factory.reset(NewPlainTableFactory(table_options)); - } +TEST_F(DBTest, MockEnvTest) { + unique_ptr env{new MockEnv(Env::Default())}; + Options options; + options.create_if_missing = true; + options.env = env.get(); + DB* db; - perf_context.Reset(); - DestroyAndReopen(options_); + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); } - ~BloomStatsTestWithParam() { - perf_context.Reset(); - Destroy(options_); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); } - // 
Required if inheriting from testing::WithParamInterface<> - static void SetUpTestCase() {} - static void TearDownTestCase() {} + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; - bool use_block_table_; - bool use_block_based_builder_; - Options options_; -}; +// TEST_FlushMemTable() is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); -TEST_F(DBTest, Empty) { - do { - Options options; - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, options); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } +#endif // ROCKSDB_LITE + + delete db; +} - std::string num; - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("0", num); +// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't +// defined. 
+#ifndef ROCKSDB_LITE +TEST_F(DBTest, MemEnvTest) { + unique_ptr env{NewMemEnv(Env::Default())}; + Options options; + options.create_if_missing = true; + options.env = env.get(); + DB* db; - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("1", num); + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; - // Block sync calls - env_->delay_sstable_sync_.store(true, std::memory_order_release); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("2", num); + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } - Put(1, "k2", std::string(100000, 'y')); // Trigger compaction - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("1", num); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } - ASSERT_EQ("v1", Get(1, "foo")); - // Release sync calls - env_->delay_sstable_sync_.store(false, std::memory_order_release); + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; - ASSERT_OK(db_->DisableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("1", num); - - ASSERT_OK(db_->DisableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", 
num); - - ASSERT_OK(db_->DisableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("3", num); - - ASSERT_OK(db_->EnableFileDeletions(false)); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", num); - - ASSERT_OK(db_->EnableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("0", num); - } while (ChangeOptions()); + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; + + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + delete db; } +#endif // ROCKSDB_LITE TEST_F(DBTest, WriteEmptyBatch) { - Options options; + Options options = CurrentOptions(); options.env = env_; options.write_buffer_size = 100000; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "bar")); - env_->sync_counter_.store(0); WriteOptions wo; wo.sync = true; wo.disableWAL = false; WriteBatch empty_batch; ASSERT_OK(dbfull()->Write(wo, &empty_batch)); - ASSERT_GE(env_->sync_counter_.load(), 1); // make sure we can re-open it. 
ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); ASSERT_EQ("bar", Get(1, "foo")); } +#ifndef ROCKSDB_LITE TEST_F(DBTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); @@ -288,13 +250,12 @@ TEST_F(DBTest, ReadOnlyDB) { TEST_F(DBTest, CompactedDB) { const uint64_t kFileSize = 1 << 20; - Options options; + Options options = CurrentOptions(); options.disable_auto_compactions = true; options.write_buffer_size = kFileSize; options.target_file_size_base = kFileSize; options.max_bytes_for_level_base = 1 << 30; options.compression = kNoCompression; - options = CurrentOptions(options); Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); @@ -361,7 +322,8 @@ TEST_F(DBTest, CompactedDB) { // MultiGet std::vector values; - std::vector status_list = dbfull()->MultiGet(ReadOptions(), + std::vector status_list = dbfull()->MultiGet( + ReadOptions(), std::vector({Slice("aaa"), Slice("ccc"), Slice("eee"), Slice("ggg"), Slice("iii"), Slice("kkk")}), &values); @@ -376,489 +338,15 @@ TEST_F(DBTest, CompactedDB) { ASSERT_OK(status_list[4]); ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]); ASSERT_TRUE(status_list[5].IsNotFound()); -} - -// Make sure that when options.block_cache is set, after a new table is -// created its index/filter blocks are added to block cache. -TEST_F(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "key", "val")); - // Create a new table. 
- ASSERT_OK(Flush(1)); - - // index/filter blocks added to block cache right after table creation. - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(2, /* only index/filter were added */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); - uint64_t int_num; - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); - - // Make sure filter block is in cache. - std::string value; - ReadOptions ropt; - db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); - - // Miss count should remain the same. - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - // Make sure index block is in cache. 
- auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); - value = Get(1, "key"); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(index_block_hit + 1, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - - value = Get(1, "key"); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(index_block_hit + 2, - TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); -} - -TEST_F(DBTest, ParanoidFileChecks) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.level0_file_num_compaction_trigger = 2; - options.paranoid_file_checks = true; - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = false; - table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "1_key", "val")); - ASSERT_OK(Put(1, "9_key", "val")); - // Create a new table. - ASSERT_OK(Flush(1)); - ASSERT_EQ(1, /* read and cache data block */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Put(1, "1_key2", "val2")); - ASSERT_OK(Put(1, "9_key2", "val2")); - // Create a new SST file. This will further trigger a compaction - // and generate another file. - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(3, /* Totally 3 files created up to now */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - // After disabling options.paranoid_file_checks. NO further block - // is added after generating a new file. 
- ASSERT_OK( - dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}})); - - ASSERT_OK(Put(1, "1_key3", "val3")); - ASSERT_OK(Put(1, "9_key3", "val3")); - ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "1_key4", "val4")); - ASSERT_OK(Put(1, "9_key4", "val4")); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(3, /* Totally 3 files created up to now */ - TestGetTickerCount(options, BLOCK_CACHE_ADD)); -} - -TEST_F(DBTest, GetPropertiesOfAllTablesTest) { - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 8; - Reopen(options); - // Create 4 tables - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); - } - db_->Flush(FlushOptions()); - } - - // 1. Read table properties directly from file - Reopen(options); - VerifyTableProperties(db_, 10 + 11 + 12 + 13); - - // 2. Put two tables to table cache and - Reopen(options); - // fetch key from 1st and 2nd table, which will internally place that table to - // the table cache. - for (int i = 0; i < 2; ++i) { - Get(ToString(i * 100 + 0)); - } - - VerifyTableProperties(db_, 10 + 11 + 12 + 13); - - // 3. Put all tables to table cache - Reopen(options); - // fetch key from 1st and 2nd table, which will internally place that table to - // the table cache. 
- for (int i = 0; i < 4; ++i) { - Get(ToString(i * 100 + 0)); - } - VerifyTableProperties(db_, 10 + 11 + 12 + 13); -} - -namespace { -void ResetTableProperties(TableProperties* tp) { - tp->data_size = 0; - tp->index_size = 0; - tp->filter_size = 0; - tp->raw_key_size = 0; - tp->raw_value_size = 0; - tp->num_data_blocks = 0; - tp->num_entries = 0; -} - -void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { - double dummy_double; - std::replace(tp_string.begin(), tp_string.end(), ';', ' '); - std::replace(tp_string.begin(), tp_string.end(), '=', ' '); - ResetTableProperties(tp); - - sscanf(tp_string.c_str(), "# data blocks %" SCNu64 - " # entries %" SCNu64 - " raw key size %" SCNu64 - " raw average key size %lf " - " raw value size %" SCNu64 - " raw average value size %lf " - " data block size %" SCNu64 - " index block size %" SCNu64 - " filter block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, - &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_size, &tp->filter_size); -} - -void VerifySimilar(uint64_t a, uint64_t b, double bias) { - ASSERT_EQ(a == 0U, b == 0U); - if (a == 0) { - return; - } - double dbl_a = static_cast(a); - double dbl_b = static_cast(b); - if (dbl_a > dbl_b) { - ASSERT_LT(static_cast(dbl_a - dbl_b) / (dbl_a + dbl_b), bias); - } else { - ASSERT_LT(static_cast(dbl_b - dbl_a) / (dbl_a + dbl_b), bias); - } -} - -void VerifyTableProperties(const TableProperties& base_tp, - const TableProperties& new_tp, - double filter_size_bias = 0.1, - double index_size_bias = 0.1, - double data_size_bias = 0.1, - double num_data_blocks_bias = 0.05) { - VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); - VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); - VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); - VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks, - num_data_blocks_bias); - 
ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); - ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); - ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); -} - -void GetExpectedTableProperties(TableProperties* expected_tp, - const int kKeySize, const int kValueSize, - const int kKeysPerTable, const int kTableCount, - const int kBloomBitsPerKey, - const size_t kBlockSize) { - const int kKeyCount = kTableCount * kKeysPerTable; - const int kAvgSuccessorSize = kKeySize / 2; - const int kEncodingSavePerKey = kKeySize / 4; - expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); - expected_tp->raw_value_size = kKeyCount * kValueSize; - expected_tp->num_entries = kKeyCount; - expected_tp->num_data_blocks = - kTableCount * - (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / - kBlockSize; - expected_tp->data_size = - kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); - expected_tp->index_size = - expected_tp->num_data_blocks * (kAvgSuccessorSize + 12); - expected_tp->filter_size = - kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); -} -} // namespace - -TEST_F(DBTest, AggregatedTableProperties) { - for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) { - const int kKeysPerTable = 100; - const int kKeySize = 80; - const int kValueSize = 200; - const int kBloomBitsPerKey = 20; - - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 8; - options.compression = kNoCompression; - options.create_if_missing = true; - - BlockBasedTableOptions table_options; - table_options.filter_policy.reset( - NewBloomFilterPolicy(kBloomBitsPerKey, false)); - table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - - DestroyAndReopen(options); - - Random rnd(5632); - for (int table = 1; table <= kTableCount; ++table) { - for (int i = 0; i < kKeysPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); - } - 
db_->Flush(FlushOptions()); - } - std::string property; - db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); - - TableProperties expected_tp; - GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size); - - TableProperties output_tp; - ParseTablePropertiesString(property, &output_tp); - - VerifyTableProperties(expected_tp, output_tp); - } -} - -TEST_F(DBTest, ReadLatencyHistogramByLevel) { - Options options = CurrentOptions(); - options.write_buffer_size = 110 << 10; - options.level0_file_num_compaction_trigger = 3; - options.num_levels = 4; - options.compression = kNoCompression; - options.max_bytes_for_level_base = 450 << 10; - options.target_file_size_base = 98 << 10; - options.max_write_buffer_number = 2; - options.statistics = rocksdb::CreateDBStatistics(); - options.max_open_files = 100; - - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - - DestroyAndReopen(options); - int key_index = 0; - Random rnd(301); - for (int num = 0; num < 5; num++) { - Put("foo", "bar"); - GenerateNewFile(&rnd, &key_index); - } - - std::string prop; - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - - // Get() after flushes, See latency histogram tracked. - for (int key = 0; key < 500; key++) { - Get(Key(key)); - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - - // Reopen and issue Get(). 
See thee latency tracked - Reopen(options); - for (int key = 0; key < 500; key++) { - Get(Key(key)); - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - - // Reopen and issue iterating. See thee latency tracked - Reopen(options); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - { - unique_ptr iter(db_->NewIterator(ReadOptions())); - for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { - } - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - - // options.max_open_files preloads table readers. 
- options.max_open_files = -1; - Reopen(options); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - for (int key = 0; key < 500; key++) { - Get(Key(key)); - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); -} - -TEST_F(DBTest, AggregatedTablePropertiesAtLevel) { - const int kTableCount = 100; - const int kKeysPerTable = 10; - const int kKeySize = 50; - const int kValueSize = 400; - const int kMaxLevel = 7; - const int kBloomBitsPerKey = 20; - Random rnd(301); - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 8; - options.compression = kNoCompression; - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.target_file_size_base = 8192; - options.max_bytes_for_level_base = 10000; - options.max_bytes_for_level_multiplier = 2; - // This ensures there no compaction happening when we call GetProperty(). 
- options.disable_auto_compactions = true; - - BlockBasedTableOptions table_options; - table_options.filter_policy.reset( - NewBloomFilterPolicy(kBloomBitsPerKey, false)); - table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - - DestroyAndReopen(options); - - std::string level_tp_strings[kMaxLevel]; - std::string tp_string; - TableProperties level_tps[kMaxLevel]; - TableProperties tp, sum_tp, expected_tp; - for (int table = 1; table <= kTableCount; ++table) { - for (int i = 0; i < kKeysPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); - } - db_->Flush(FlushOptions()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ResetTableProperties(&sum_tp); - for (int level = 0; level < kMaxLevel; ++level) { - db_->GetProperty( - DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level), - &level_tp_strings[level]); - ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]); - sum_tp.data_size += level_tps[level].data_size; - sum_tp.index_size += level_tps[level].index_size; - sum_tp.filter_size += level_tps[level].filter_size; - sum_tp.raw_key_size += level_tps[level].raw_key_size; - sum_tp.raw_value_size += level_tps[level].raw_value_size; - sum_tp.num_data_blocks += level_tps[level].num_data_blocks; - sum_tp.num_entries += level_tps[level].num_entries; - } - db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); - ParseTablePropertiesString(tp_string, &tp); - ASSERT_EQ(sum_tp.data_size, tp.data_size); - ASSERT_EQ(sum_tp.index_size, tp.index_size); - ASSERT_EQ(sum_tp.filter_size, tp.filter_size); - ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size); - ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); - ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); - ASSERT_EQ(sum_tp.num_entries, tp.num_entries); - if (table > 3) { - GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, 
table, kBloomBitsPerKey, - table_options.block_size); - // Gives larger bias here as index block size, filter block size, - // and data block size become much harder to estimate in this test. - VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); - } - } -} - -class CoutingUserTblPropCollector : public TablePropertiesCollector { - public: - const char* Name() const override { return "CoutingUserTblPropCollector"; } - - Status Finish(UserCollectedProperties* properties) override { - std::string encoded; - PutVarint32(&encoded, count_); - *properties = UserCollectedProperties{ - {"CoutingUserTblPropCollector", message_}, {"Count", encoded}, - }; - return Status::OK(); - } - - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { - ++count_; - return Status::OK(); - } - - virtual UserCollectedProperties GetReadableProperties() const override { - return UserCollectedProperties{}; - } - - private: - std::string message_ = "Rocksdb"; - uint32_t count_ = 0; -}; -class CoutingUserTblPropCollectorFactory - : public TablePropertiesCollectorFactory { - public: - virtual TablePropertiesCollector* CreateTablePropertiesCollector() override { - return new CoutingUserTblPropCollector(); - } - const char* Name() const override { - return "CoutingUserTblPropCollectorFactory"; - } -}; - -TEST_F(DBTest, GetUserDefinedTablaProperties) { - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = (1<<30); - options.max_background_flushes = 0; - options.table_properties_collector_factories.resize(1); - options.table_properties_collector_factories[0] = - std::make_shared(); Reopen(options); - // Create 4 tables - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); - } - db_->Flush(FlushOptions()); - } - - TablePropertiesCollection props; - ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); - 
ASSERT_EQ(4U, props.size()); - uint32_t sum = 0; - for (const auto& item : props) { - auto& user_collected = item.second->user_collected_properties; - ASSERT_TRUE(user_collected.find("CoutingUserTblPropCollector") != - user_collected.end()); - ASSERT_EQ(user_collected.at("CoutingUserTblPropCollector"), "Rocksdb"); - ASSERT_TRUE(user_collected.find("Count") != user_collected.end()); - Slice key(user_collected.at("Count")); - uint32_t count; - ASSERT_TRUE(GetVarint32(&key, &count)); - sum += count; - } - ASSERT_EQ(10u + 11u + 12u + 13u, sum); + // Add a key + ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported operation in read only mode."); } TEST_F(DBTest, LevelLimitReopen) { @@ -882,6 +370,7 @@ TEST_F(DBTest, LevelLimitReopen) { options.max_bytes_for_level_multiplier_additional.resize(10, 1); ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } +#endif // ROCKSDB_LITE TEST_F(DBTest, PutDeleteGet) { do { @@ -911,6 +400,119 @@ TEST_F(DBTest, PutSingleDeleteGet) { kSkipUniversalCompaction | kSkipMergePut)); } +TEST_F(DBTest, ReadFromPersistedTier) { + do { + Random rnd(301); + Options options = CurrentOptions(); + for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) { + CreateAndReopenWithCF({"pikachu"}, options); + WriteOptions wopt; + wopt.disableWAL = (disableWAL == 1); + // 1st round: put but not flush + ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first")); + ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one")); + ASSERT_EQ("first", Get(1, "foo")); + ASSERT_EQ("one", Get(1, "bar")); + + // Read directly from persited data. + ReadOptions ropt; + ropt.read_tier = kPersistedTier; + std::string value; + if (wopt.disableWAL) { + // as data has not yet being flushed, we expect not found. 
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound()); + ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); + ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); + } + + // Multiget + std::vector multiget_cfs; + multiget_cfs.push_back(handles_[1]); + multiget_cfs.push_back(handles_[1]); + std::vector multiget_keys; + multiget_keys.push_back("foo"); + multiget_keys.push_back("bar"); + std::vector multiget_values; + auto statuses = + db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[0].IsNotFound()); + ASSERT_TRUE(statuses[1].IsNotFound()); + } else { + ASSERT_OK(statuses[0]); + ASSERT_OK(statuses[1]); + } + + // 2nd round: flush and put a new value in memtable. + ASSERT_OK(Flush(1)); + ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello")); + + // once the data has been flushed, we are able to get the + // data when kPersistedTier is used. 
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok()); + ASSERT_EQ(value, "first"); + ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok()); + ASSERT_EQ(value, "one"); + if (wopt.disableWAL) { + ASSERT_TRUE( + db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value)); + ASSERT_EQ(value, "hello"); + } + + // Expect same result in multiget + multiget_cfs.push_back(handles_[1]); + multiget_keys.push_back("rocksdb"); + statuses = + db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); + ASSERT_TRUE(statuses[0].ok()); + ASSERT_EQ("first", multiget_values[0]); + ASSERT_TRUE(statuses[1].ok()); + ASSERT_EQ("one", multiget_values[1]); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[2].IsNotFound()); + } else { + ASSERT_OK(statuses[2]); + } + + // 3rd round: delete and flush + ASSERT_OK(db_->Delete(wopt, handles_[1], "foo")); + Flush(1); + ASSERT_OK(db_->Delete(wopt, handles_[1], "bar")); + + ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound()); + if (wopt.disableWAL) { + // Still expect finding the value as its delete has not yet being + // flushed. 
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok()); + ASSERT_EQ(value, "one"); + } else { + ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound()); + } + ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok()); + ASSERT_EQ(value, "hello"); + + statuses = + db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); + ASSERT_TRUE(statuses[0].IsNotFound()); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[1].ok()); + ASSERT_EQ("one", multiget_values[1]); + } else { + ASSERT_TRUE(statuses[1].IsNotFound()); + } + ASSERT_TRUE(statuses[2].ok()); + ASSERT_EQ("hello", multiget_values[2]); + if (wopt.disableWAL == 0) { + DestroyAndReopen(options); + } + } + } while (ChangeOptions(kSkipHashCuckoo)); +} + TEST_F(DBTest, SingleDeleteFlush) { // Test to check whether flushing preserves a single delete hidden // behind a put. @@ -996,12 +598,64 @@ TEST_F(DBTest, EmptyFlush) { kSkipUniversalCompaction | kSkipMergePut)); } +// Disable because not all platform can run it. +// It requires more than 9GB memory to run it, With single allocation +// of more than 3GB. 
+TEST_F(DBTest, DISABLED_VeryLargeValue) { + const size_t kValueSize = 3221225472u; // 3GB value + const size_t kKeySize = 8388608u; // 8MB key + std::string raw(kValueSize, 'v'); + std::string key1(kKeySize, 'c'); + std::string key2(kKeySize, 'd'); + + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options.paranoid_checks = true; + DestroyAndReopen(options); + + ASSERT_OK(Put("boo", "v1")); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put(key1, raw)); + raw[0] = 'w'; + ASSERT_OK(Put(key2, raw)); + dbfull()->TEST_WaitForFlushMemTable(); + + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + std::string value; + Status s = db_->Get(ReadOptions(), key1, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('v', value[0]); + + s = db_->Get(ReadOptions(), key2, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('w', value[0]); + + // Compact all files. + Flush(); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Check DB is not in read-only state. 
+ ASSERT_OK(Put("boo", "v1")); + + s = db_->Get(ReadOptions(), key1, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('v', value[0]); + + s = db_->Get(ReadOptions(), key2, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('w', value[0]); +} + TEST_F(DBTest, GetFromImmutableLayer) { do { - Options options; + Options options = CurrentOptions(); options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "v1")); @@ -1009,8 +663,8 @@ TEST_F(DBTest, GetFromImmutableLayer) { // Block sync calls env_->delay_sstable_sync_.store(true, std::memory_order_release); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger flush ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(0, "foo")); // Release sync calls @@ -1028,6 +682,7 @@ TEST_F(DBTest, GetFromVersions) { } while (ChangeOptions()); } +#ifndef ROCKSDB_LITE TEST_F(DBTest, GetSnapshot) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; @@ -1053,6 +708,7 @@ TEST_F(DBTest, GetSnapshot) { } } while (ChangeOptions()); } +#endif // ROCKSDB_LITE TEST_F(DBTest, GetLevel0Ordering) { do { @@ -1080,6 +736,7 @@ TEST_F(DBTest, WrongLevel0Config) { ASSERT_OK(DB::Open(options, dbname_, &db_)); } +#ifndef ROCKSDB_LITE TEST_F(DBTest, GetOrderedByLevels) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -1151,8581 +808,4890 @@ TEST_F(DBTest, GetEncountersEmptyLevel) { ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); } +#endif // ROCKSDB_LITE -// KeyMayExist can lead to a few false positives, but not false negatives. 
-// To make test deterministic, use a much larger number of bits per key-20 than -// bits in the key, so that false positives are eliminated -TEST_F(DBTest, KeyMayExist) { - do { - ReadOptions ropts; - std::string value; - anon::OptionsOverride options_override; - options_override.filter_policy.reset(NewBloomFilterPolicy(20)); - Options options = CurrentOptions(options_override); - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); - - ASSERT_OK(Put(1, "a", "b")); - bool value_found = false; - ASSERT_TRUE( - db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); - ASSERT_TRUE(value_found); - ASSERT_EQ("b", value); - - ASSERT_OK(Flush(1)); - value.clear(); - - long numopen = TestGetTickerCount(options, NO_FILE_OPENS); - long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE( - db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); - ASSERT_TRUE(!value_found); - // assert that no new files were opened and no new blocks were - // read into block cache. 
- ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Delete(1, "a")); - - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); - - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - ASSERT_OK(Delete(1, "c")); - - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - - // KeyMayExist function only checks data in block caches, which is not used - // by plain table format. - } while ( - ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); -} - -TEST_F(DBTest, NonBlockingIteration) { +TEST_F(DBTest, CheckLock) { do { - ReadOptions non_blocking_opts, regular_opts; + DB* localdb; Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - non_blocking_opts.read_tier = kBlockCacheTier; - CreateAndReopenWithCF({"pikachu"}, options); - // write one kv to the database. - ASSERT_OK(Put(1, "a", "b")); - - // scan using non-blocking iterator. We should find it because - // it is in memtable. 
- Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - delete iter; - - // flush memtable to storage. Now, the key should not be in the - // memtable neither in the block cache. - ASSERT_OK(Flush(1)); + ASSERT_OK(TryReopen(options)); - // verify that a non-blocking iterator does not find any - // kvs. Neither does it do any IOs to storage. - long numopen = TestGetTickerCount(options, NO_FILE_OPENS); - long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - count++; - } - ASSERT_EQ(count, 0); - ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // read in the specified block via a regular get - ASSERT_EQ(Get(1, "a"), "b"); - - // verify that we can find it via a non-blocking scan - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // This test verifies block cache behaviors, which is not used by plain - // table format. 
- // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | - kSkipMmapReads)); + // second open should fail + ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); + } while (ChangeCompactOptions()); } -TEST_F(DBTest, ManagedNonBlockingIteration) { +TEST_F(DBTest, FlushMultipleMemtable) { do { - ReadOptions non_blocking_opts, regular_opts; Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - non_blocking_opts.read_tier = kBlockCacheTier; - non_blocking_opts.managed = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_number_to_maintain = -1; CreateAndReopenWithCF({"pikachu"}, options); - // write one kv to the database. - ASSERT_OK(Put(1, "a", "b")); - - // scan using non-blocking iterator. We should find it because - // it is in memtable. - Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - delete iter; - - // flush memtable to storage. Now, the key should not be in the - // memtable neither in the block cache. + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - // verify that a non-blocking iterator does not find any - // kvs. Neither does it do any IOs to storage. 
- int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); - int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - count++; - } - ASSERT_EQ(count, 0); - ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // read in the specified block via a regular get - ASSERT_EQ(Get(1, "a"), "b"); - - // verify that we can find it via a non-blocking scan - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // This test verifies block cache behaviors, which is not used by plain - // table format. 
- // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | - kSkipMmapReads)); -} - -// A delete is skipped for key if KeyMayExist(key) returns False -// Tests Writebatch consistency and proper delete behaviour -TEST_F(DBTest, FilterDeletes) { - do { - anon::OptionsOverride options_override; - options_override.filter_policy.reset(NewBloomFilterPolicy(20)); - Options options = CurrentOptions(options_override); - options.filter_deletes = true; - CreateAndReopenWithCF({"pikachu"}, options); - WriteBatch batch; - - batch.Delete(handles_[1], "a"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(AllEntriesFor("a", 1), "[ ]"); // Delete skipped - batch.Clear(); - - batch.Put(handles_[1], "a", "b"); - batch.Delete(handles_[1], "a"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(Get(1, "a"), "NOT_FOUND"); - ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]"); // Delete issued - batch.Clear(); - - batch.Delete(handles_[1], "c"); - batch.Put(handles_[1], "c", "d"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(Get(1, "c"), "d"); - ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]"); // Delete skipped - batch.Clear(); - - ASSERT_OK(Flush(1)); // A stray Flush - - batch.Delete(handles_[1], "c"); - dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]"); // Delete issued - batch.Clear(); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); } while (ChangeCompactOptions()); } -TEST_F(DBTest, GetFilterByPrefixBloom) { - Options options = last_options_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - - WriteOptions wo; - 
ReadOptions ro; - FlushOptions fo; - fo.wait = true; - std::string value; - - ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); - ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); - ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - - dbfull()->Flush(fo); - - ASSERT_EQ("foo", Get("barbarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("foo2", Get("barbarbar2")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +TEST_F(DBTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); - ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + Options options = CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 1; + CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); -} + // Compaction can still go through even if no thread can flush the + // mem table. 
+ ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); -TEST_F(DBTest, WholeKeyFilterProp) { - Options options = last_options_; - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - options.statistics = rocksdb::CreateDBStatistics(); + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); - WriteOptions wo; - ReadOptions ro; - FlushOptions fo; - fo.wait = true; - std::string value; + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - dbfull()->Flush(fo); + // Flush can still go through. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); - Reopen(options); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - - // Reopen with whole key filtering enabled and prefix extractor - // NULL. Bloom filter should be off for both of whole key and - // prefix bloom. 
- bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - options.prefix_extractor.reset(); - Reopen(options); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - // Write DB with only full key filtering. - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); +TEST_F(DBTest, FLUSH) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime); + ; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + // this will now also flush the last 2 writes + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - // Reopen with both of whole key off and prefix extractor enabled. - // Still no bloom filter should be used. 
- options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); + perf_context.Reset(); + Get(1, "foo"); + ASSERT_TRUE((int)perf_context.get_from_output_files_time > 0); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - - // Try to create a DB with mixed files: - ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); - // Needs insert some keys to make sure files are not filtered out by key - // ranges. - ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); - options.prefix_extractor.reset(); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + ASSERT_OK(Flush(1)); - // Try to create a DB with mixed files. - ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); - // In this case needs insert some keys to make sure files are - // not filtered out by key ranges. 
- ASSERT_OK(dbfull()->Put(wo, "aaa", "")); - ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - Flush(); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v2", Get(1, "bar")); + perf_context.Reset(); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_TRUE((int)perf_context.get_from_output_files_time > 0); - // Now we have two files: - // File 1: An older file with prefix bloom. - // File 2: A newer file with whole bloom filter. - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); - - // Reopen with the same setting: only whole key is used - Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); - - // Restart with both filters are allowed - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); - // File 1 will has it filtered out. - // File 2 will not, as prefix `foo` exists in the file. 
- ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - - // Restart with only prefix bloom is allowed. - options.prefix_extractor.reset(NewFixedPrefixTransform(3)); - bbto.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); - ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); - ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); - ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); -} + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + ASSERT_OK(Flush(1)); -TEST_F(DBTest, IterSeekBeforePrev) { - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("0", "f")); - ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("2", "j")); - auto iter = db_->NewIterator(ReadOptions()); - iter->Seek(Slice("c")); - iter->Prev(); - iter->Seek(Slice("a")); - iter->Prev(); - delete iter; -} + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + // 'foo' should be there because its put + // has WAL enabled. 
+ ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); -namespace { -std::string MakeLongKey(size_t length, char c) { - return std::string(length, c); + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); } -} // namespace - -TEST_F(DBTest, IterLongKeys) { - ASSERT_OK(Put(MakeLongKey(20, 0), "0")); - ASSERT_OK(Put(MakeLongKey(32, 2), "2")); - ASSERT_OK(Put("a", "b")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put(MakeLongKey(50, 1), "1")); - ASSERT_OK(Put(MakeLongKey(127, 3), "3")); - ASSERT_OK(Put(MakeLongKey(64, 4), "4")); - auto iter = db_->NewIterator(ReadOptions()); - - // Create a key that needs to be skipped for Seq too new - iter->Seek(MakeLongKey(20, 0)); - ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); - delete iter; - iter = db_->NewIterator(ReadOptions()); - iter->Seek(MakeLongKey(50, 1)); - ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); - delete iter; -} +#ifndef ROCKSDB_LITE +TEST_F(DBTest, FlushSchedule) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 1; + options.max_write_buffer_number = 2; + options.write_buffer_size = 120 * 1024; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; -TEST_F(DBTest, IterNextWithNewerSeq) { - ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("a", "b")); - 
ASSERT_OK(Put("c", "d")); - ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); + std::atomic thread_num(0); + // each column family will have 5 thread, each thread generating 2 memtables. + // each column family should end up with 10 table files + std::function fill_memtable_func = [&]() { + int a = thread_num.fetch_add(1); + Random rnd(a); + WriteOptions wo; + // this should fill up 2 memtables + for (int k = 0; k < 5000; ++k) { + ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + } + }; - // Create a key that needs to be skipped for Seq too new - for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; - i++) { - ASSERT_OK(Put("b", "f")); + for (int i = 0; i < 10; ++i) { + threads.emplace_back(fill_memtable_func); } - iter->Seek(Slice("a")); - ASSERT_EQ(IterStatus(iter), "a->b"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->d"); - delete iter; -} - -TEST_F(DBTest, IterPrevWithNewerSeq) { - ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); - - // Create a key that needs to be skipped for Seq too new - for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; - i++) { - ASSERT_OK(Put("b", "f")); + for (auto& t : threads) { + t.join(); } - iter->Seek(Slice("d")); - ASSERT_EQ(IterStatus(iter), "d->e"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->d"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->b"); - - iter->Prev(); - delete iter; + auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); + auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + ASSERT_LE(default_tables, static_cast(10)); + ASSERT_GT(default_tables, static_cast(0)); + ASSERT_LE(pikachu_tables, static_cast(10)); + ASSERT_GT(pikachu_tables, static_cast(0)); } +#endif // ROCKSDB_LITE -TEST_F(DBTest, IterPrevWithNewerSeq2) { - 
ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); - iter->Seek(Slice("c")); - ASSERT_EQ(IterStatus(iter), "c->d"); - - // Create a key that needs to be skipped for Seq too new - for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; - i++) { - ASSERT_OK(Put("b", "f")); - } - - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->b"); - - iter->Prev(); - delete iter; -} - -TEST_F(DBTest, IterEmpty) { +TEST_F(DBTest, ManifestRollOver) { do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("foo"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; + Options options; + options.max_manifest_file_size = 10; // 10 bytes + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + { + ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); + ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); + ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); + uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_OK(Flush(1)); // This should trigger LogAndApply. + uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_GT(manifest_after_flush, manifest_before_flush); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); + // check if a new manifest file got inserted or not. 
+ ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); + ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2")); + ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3")); + } } while (ChangeCompactOptions()); } -TEST_F(DBTest, IterSingle) { +TEST_F(DBTest, IdentityAcrossRestarts) { do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); + std::string id1; + ASSERT_OK(db_->GetDbIdentity(id1)); - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); + Options options = CurrentOptions(); + Reopen(options); + std::string id2; + ASSERT_OK(db_->GetDbIdentity(id2)); + // id1 should match id2 because identity was not regenerated + ASSERT_EQ(id1.compare(id2), 0); - delete iter; + std::string idfilename = IdentityFileName(dbname_); + ASSERT_OK(env_->DeleteFile(idfilename)); + Reopen(options); + std::string id3; + ASSERT_OK(db_->GetDbIdentity(id3)); + // id1 should NOT match id3 because identity was regenerated + ASSERT_NE(id1.compare(id3), 0); } while (ChangeCompactOptions()); } -TEST_F(DBTest, IterMulti) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "a", "va")); - ASSERT_OK(Put(1, "b", "vb")); 
- ASSERT_OK(Put(1, "c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); +namespace { +class KeepFilter : public CompactionFilter { + public: + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, + bool* value_changed) const override { + return false; + } - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("ax"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("z"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - // Switch from reverse to forward - iter->SeekToLast(); - iter->Prev(); - iter->Prev(); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); + virtual const char* Name() const override { return "KeepFilter"; } +}; - // Switch from forward to reverse - iter->SeekToFirst(); - iter->Next(); - iter->Next(); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Make sure iter stays at snapshot - ASSERT_OK(Put(1, "a", "va2")); - ASSERT_OK(Put(1, "a2", "va3")); - ASSERT_OK(Put(1, "b", "vb2")); - ASSERT_OK(Put(1, "c", "vc2")); - ASSERT_OK(Delete(1, "b")); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - 
ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; - } while (ChangeCompactOptions()); -} +class KeepFilterFactory : public CompactionFilterFactory { + public: + explicit KeepFilterFactory(bool check_context = false) + : check_context_(check_context) {} -// Check that we can skip over a run of user keys -// by using reseek rather than sequential scan -TEST_F(DBTest, IterReseek) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - Options options = CurrentOptions(options_override); - options.max_sequential_skip_in_iterations = 3; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (check_context_) { + EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); + EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + } + return std::unique_ptr(new KeepFilter()); + } - // insert two keys with same userkey and verify that - // reseek is not invoked. For each of these test cases, - // verify that we can find the next key "b". 
- ASSERT_OK(Put(1, "a", "one")); - ASSERT_OK(Put(1, "a", "two")); - ASSERT_OK(Put(1, "b", "bone")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - ASSERT_EQ(IterStatus(iter), "a->two"); - iter->Next(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - ASSERT_EQ(IterStatus(iter), "b->bone"); - delete iter; + virtual const char* Name() const override { return "KeepFilterFactory"; } + bool check_context_; + std::atomic_bool expect_full_compaction_; + std::atomic_bool expect_manual_compaction_; +}; - // insert a total of three keys with same userkey and verify - // that reseek is still not invoked. - ASSERT_OK(Put(1, "a", "three")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->three"); - iter->Next(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - ASSERT_EQ(IterStatus(iter), "b->bone"); - delete iter; +class DelayFilter : public CompactionFilter { + public: + explicit DelayFilter(DBTestBase* d) : db_test(d) {} + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, + bool* value_changed) const override { + db_test->env_->addon_time_.fetch_add(1000); + return true; + } - // insert a total of four keys with same userkey and verify - // that reseek is invoked. 
- ASSERT_OK(Put(1, "a", "four")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->four"); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); - iter->Next(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); - ASSERT_EQ(IterStatus(iter), "b->bone"); - delete iter; + virtual const char* Name() const override { return "DelayFilter"; } - // Testing reverse iterator - // At this point, we have three versions of "a" and one version of "b". - // The reseek statistics is already at 1. - int num_reseeks = - (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); - - // Insert another version of b and assert that reseek is not invoked - ASSERT_OK(Put(1, "b", "btwo")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "b->btwo"); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks); - iter->Prev(); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks + 1); - ASSERT_EQ(IterStatus(iter), "a->four"); - delete iter; + private: + DBTestBase* db_test; +}; - // insert two more versions of b. This makes a total of 4 versions - // of b and 4 versions of a. 
- ASSERT_OK(Put(1, "b", "bthree")); - ASSERT_OK(Put(1, "b", "bfour")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "b->bfour"); - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks + 2); - iter->Prev(); - - // the previous Prev call should have invoked reseek - ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), - num_reseeks + 3); - ASSERT_EQ(IterStatus(iter), "a->four"); - delete iter; -} +class DelayFilterFactory : public CompactionFilterFactory { + public: + explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr(new DelayFilter(db_test)); + } -TEST_F(DBTest, IterSmallAndLargeMix) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "a", "va")); - ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); - ASSERT_OK(Put(1, "c", "vc")); - ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); - ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); + virtual const char* Name() const override { return "DelayFilterFactory"; } - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + private: + DBTestBase* db_test; +}; +} // namespace - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - 
ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; - } while (ChangeCompactOptions()); -} +#ifndef ROCKSDB_LITE -TEST_F(DBTest, IterMultiWithDelete) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "ka", "va")); - ASSERT_OK(Put(1, "kb", "vb")); - ASSERT_OK(Put(1, "kc", "vc")); - ASSERT_OK(Delete(1, "kb")); - ASSERT_EQ("NOT_FOUND", Get(1, "kb")); - - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->Seek("kc"); - ASSERT_EQ(IterStatus(iter), "kc->vc"); - if (!CurrentOptions().merge_operator) { - // TODO: merge operator does not support backward iteration yet - if (kPlainTableAllBytesPrefix != option_config_&& - kBlockBasedTableWithWholeKeyHashIndex != option_config_ && - kHashLinkList != option_config_) { - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "ka->va"); - } - } - delete iter; - } while (ChangeOptions()); +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; } +#endif // ROCKSDB_LITE -TEST_F(DBTest, IterPrevMaxSkip) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - for (int i = 0; i < 2; i++) { - ASSERT_OK(Put(1, "key1", "v1")); - ASSERT_OK(Put(1, "key2", "v2")); - ASSERT_OK(Put(1, "key3", "v3")); - ASSERT_OK(Put(1, "key4", "v4")); - ASSERT_OK(Put(1, "key5", "v5")); - } - - VerifyIterLast("key5->v5", 1); - - ASSERT_OK(Delete(1, "key5")); - VerifyIterLast("key4->v4", 1); - - ASSERT_OK(Delete(1, "key4")); - VerifyIterLast("key3->v3", 1); - - ASSERT_OK(Delete(1, "key3")); - VerifyIterLast("key2->v2", 1); - - ASSERT_OK(Delete(1, "key2")); - VerifyIterLast("key1->v1", 1); - - ASSERT_OK(Delete(1, "key1")); - VerifyIterLast("(invalid)", 1); - } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); +TEST_F(DBTest, FailMoreDbPaths) { + Options options = 
CurrentOptions(); + options.db_paths.emplace_back(dbname_, 10000000); + options.db_paths.emplace_back(dbname_ + "_2", 1000000); + options.db_paths.emplace_back(dbname_ + "_3", 1000000); + options.db_paths.emplace_back(dbname_ + "_4", 1000000); + options.db_paths.emplace_back(dbname_ + "_5", 1000000); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); } -TEST_F(DBTest, IterWithSnapshot) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - ASSERT_OK(Put(1, "key1", "val1")); - ASSERT_OK(Put(1, "key2", "val2")); - ASSERT_OK(Put(1, "key3", "val3")); - ASSERT_OK(Put(1, "key4", "val4")); - ASSERT_OK(Put(1, "key5", "val5")); - - const Snapshot *snapshot = db_->GetSnapshot(); - ReadOptions options; - options.snapshot = snapshot; - Iterator* iter = db_->NewIterator(options, handles_[1]); - - // Put more values after the snapshot - ASSERT_OK(Put(1, "key100", "val100")); - ASSERT_OK(Put(1, "key101", "val101")); - - iter->Seek("key5"); - ASSERT_EQ(IterStatus(iter), "key5->val5"); - if (!CurrentOptions().merge_operator) { - // TODO: merge operator does not support backward iteration yet - if (kPlainTableAllBytesPrefix != option_config_&& - kBlockBasedTableWithWholeKeyHashIndex != option_config_ && - kHashLinkList != option_config_) { - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "key4->val4"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "key3->val3"); - - iter->Next(); - ASSERT_EQ(IterStatus(iter), "key4->val4"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "key5->val5"); - } - iter->Next(); - ASSERT_TRUE(!iter->Valid()); +void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { + uint64_t cf_size = 0; + uint64_t cf_csize = 0; + size_t file_count = 0; + for (auto level_meta : cf_meta.levels) { + uint64_t level_size = 0; + uint64_t level_csize = 0; + file_count += level_meta.files.size(); + for (auto file_meta : level_meta.files) { + level_size += 
file_meta.size; } - db_->ReleaseSnapshot(snapshot); - delete iter; - // skip as HashCuckooRep does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo)); + ASSERT_EQ(level_meta.size, level_size); + cf_size += level_size; + cf_csize += level_csize; + } + ASSERT_EQ(cf_meta.file_count, file_count); + ASSERT_EQ(cf_meta.size, cf_size); } -TEST_F(DBTest, Recover) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_OK(Put(1, "baz", "v5")); - - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v1", Get(1, "foo")); - - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v5", Get(1, "baz")); - ASSERT_OK(Put(1, "bar", "v2")); - ASSERT_OK(Put(1, "foo", "v3")); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, ColumnFamilyMetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v3", Get(1, "foo")); - ASSERT_OK(Put(1, "foo", "v4")); - ASSERT_EQ("v4", Get(1, "foo")); - ASSERT_EQ("v2", Get(1, "bar")); - ASSERT_EQ("v5", Get(1, "baz")); - } while (ChangeOptions()); + Random rnd(301); + int key_index = 0; + ColumnFamilyMetaData cf_meta; + for (int i = 0; i < 100; ++i) { + GenerateNewFile(&rnd, &key_index); + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta); + } } -TEST_F(DBTest, RecoverWithTableHandle) { - do { - Options options; - options.create_if_missing = true; - options.write_buffer_size = 100; - options.disable_auto_compactions = true; - options = CurrentOptions(options); - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_OK(Put(1, "bar", "v2")); - ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "foo", "v3")); - ASSERT_OK(Put(1, "bar", "v4")); - ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "big", std::string(100, 'a'))); - ReopenWithColumnFamilies({"default", "pikachu"}, 
CurrentOptions()); +namespace { +void MinLevelHelper(DBTest* self, Options& options) { + Random rnd(301); - std::vector> files; - dbfull()->TEST_GetFilesMetaData(handles_[1], &files); - int total_files = 0; - for (const auto& level : files) { - total_files += level.size(); - } - ASSERT_EQ(total_files, 3); - for (const auto& level : files) { - for (const auto& file : level) { - if (kInfiniteMaxOpenFiles == option_config_) { - ASSERT_TRUE(file.table_reader_handle != nullptr); - } else { - ASSERT_TRUE(file.table_reader_handle == nullptr); - } - } + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(DBTestBase::RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); } - } while (ChangeOptions()); -} - -TEST_F(DBTest, IgnoreRecoveredLog) { - std::string backup_logs = dbname_ + "/backup_logs"; + self->dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); + } - // delete old files in backup_logs directory - env_->CreateDirIfMissing(backup_logs); - std::vector old_files; - env_->GetChildren(backup_logs, &old_files); - for (auto& file : old_files) { - if (file != "." 
&& file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + // generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(DBTestBase::RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); } + self->dbfull()->TEST_WaitForCompact(); - do { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.merge_operator = MergeOperators::CreateUInt64AddOperator(); - options.wal_dir = dbname_ + "/logs"; - DestroyAndReopen(options); + ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); + ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); +} - // fill up the DB - std::string one, two; - PutFixed64(&one, 1); - PutFixed64(&two, 2); - ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); - ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); - ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one))); - - // copy the logs to backup - std::vector logs; - env_->GetChildren(options.wal_dir, &logs); - for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); - } - } +// returns false if the calling-Test should be skipped +bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, + int lev, int strategy) { + fprintf(stderr, + "Test with compression options : window_bits = %d, level = %d, " + "strategy = %d}\n", + wbits, lev, strategy); + options.write_buffer_size = 100 << 10; // 100KB + options.arena_block_size = 4096; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 3; + options.create_if_missing = true; - // recover the DB - Reopen(options); - ASSERT_EQ(two, Get("foo")); - ASSERT_EQ(one, Get("bar")); - Close(); - - // copy the logs from backup back to wal dir - for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } - } - // this should ignore the log files, recovery should not happen again - // if the recovery happens, the same merge operator would be called twice, - // leading to incorrect results - Reopen(options); - ASSERT_EQ(two, Get("foo")); - ASSERT_EQ(one, Get("bar")); - Close(); - Destroy(options); - Reopen(options); - Close(); + if (Snappy_Supported()) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (Zlib_Supported()) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2_Supported()) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else if (LZ4_Supported()) { + type = kLZ4Compression; + fprintf(stderr, "using lz4\n"); + } else if (XPRESS_Supported()) { + type = kXpressCompression; + fprintf(stderr, "using xpress\n"); + } else if (ZSTD_Supported()) { + type = kZSTDNotFinalCompression; + fprintf(stderr, "using ZSTD\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return false; + } + options.compression_per_level.resize(options.num_levels); - // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); - for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } - } - // assert that we successfully recovered only from logs, even though we - // destroyed the DB - Reopen(options); - ASSERT_EQ(two, Get("foo")); - ASSERT_EQ(one, Get("bar")); + // do not compress L0 + for (int i = 0; i < 1; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 1; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + return true; +} +} // namespace - // Recovery will fail if DB directory doesn't exist. 
- Destroy(options); - // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); - for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - // we won't be needing this file no more - env_->DeleteFile(backup_logs + "/" + log); - } - } - Status s = TryReopen(options); - ASSERT_TRUE(!s.ok()); - } while (ChangeOptions(kSkipHashCuckoo)); +TEST_F(DBTest, MinLevelToCompress1) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, -14, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(options); + MinLevelHelper(this, options); } -TEST_F(DBTest, CheckLock) { - do { - DB* localdb; - Options options = CurrentOptions(); - ASSERT_OK(TryReopen(options)); +TEST_F(DBTest, MinLevelToCompress2) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, 15, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); - // second open should fail - ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); - } while (ChangeCompactOptions()); + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(options); + MinLevelHelper(this, options); } -TEST_F(DBTest, FlushMultipleMemtable) { +TEST_F(DBTest, RepeatedWritesToSameKey) { do { Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 4; - 
options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = -1; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); - ASSERT_OK(Flush(1)); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); - ASSERT_OK(Flush(1)); + // We must have at most one file per level except for level-0, + // which may have up to kL0_StopWritesTrigger files. + const int kMaxFiles = + options.num_levels + options.level0_stop_writes_trigger; + + Random rnd(301); + std::string value = + RandomString(&rnd, static_cast(2 * options.write_buffer_size)); + for (int i = 0; i < 5 * kMaxFiles; i++) { + ASSERT_OK(Put(1, "key", value)); + ASSERT_LE(TotalTableFiles(1), kMaxFiles); + } } while (ChangeCompactOptions()); } +#endif // ROCKSDB_LITE -TEST_F(DBTest, NumImmutableMemTable) { +TEST_F(DBTest, SparseMerge) { do { Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = 0; - options.write_buffer_size = 1000000; + options.compression = kNoCompression; CreateAndReopenWithCF({"pikachu"}, options); - std::string big_value(1000000 * 2, 'x'); - std::string num; - SetPerfLevel(kEnableTime);; - ASSERT_TRUE(GetPerfLevel() == kEnableTime); - - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value)); - ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ(num, "1"); - perf_context.Reset(); - Get(1, "k1"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - - ASSERT_OK(dbfull()->Put(writeOpt, 
handles_[1], "k2", big_value)); - ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); - ASSERT_EQ(num, "1"); + FillLevels("A", "Z", 1); - perf_context.Reset(); - Get(1, "k1"); - ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get(1, "k2"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value)); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.cur-size-active-mem-table", &num)); - ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "2"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); - ASSERT_EQ(num, "2"); - perf_context.Reset(); - Get(1, "k2"); - ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get(1, "k3"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get(1, "k1"); - ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. 
+ const std::string value(1000, 'x'); + Put(1, "A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } + Put(1, "C", "vc"); + ASSERT_OK(Flush(1)); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + // Make sparse update + Put(1, "A", "va2"); + Put(1, "B100", "bvalue2"); + Put(1, "C", "vc2"); ASSERT_OK(Flush(1)); - ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.cur-size-active-mem-table", &num)); - // "200" is the size of the metadata of an empty skiplist, this would - // break if we change the default skiplist implementation - ASSERT_EQ(num, "200"); - - uint64_t int_num; - uint64_t base_total_size; - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.estimate-num-keys", &base_total_size)); - - ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", "")); - ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3")); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num)); - ASSERT_EQ(int_num, 2U); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &int_num)); - ASSERT_EQ(int_num, 3U); - - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num)); - ASSERT_EQ(int_num, 4U); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num)); - ASSERT_EQ(int_num, 2U); - - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.estimate-num-keys", &int_num)); - ASSERT_EQ(int_num, base_total_size + 1); - SetPerfLevel(kDisable); - 
ASSERT_TRUE(GetPerfLevel() == kDisable); + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); } while (ChangeCompactOptions()); } -TEST_F(DBTest, FlushEmptyColumnFamily) { - // Block flush thread and disable compaction thread - env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - test::SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_high, Env::Priority::HIGH); +#ifndef ROCKSDB_LITE +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} +TEST_F(DBTest, ApproximateSizesMemTable) { Options options = CurrentOptions(); - // disable compaction - options.disable_auto_compactions = true; - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 2; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; - CreateAndReopenWithCF({"pikachu"}, options); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); - // Compaction can still go through even if 
no thread can flush the - // mem table. - ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } - // Insert can go through - ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + // Zero if not including mem table + db_->GetApproximateSizes(&r, 1, &size, false); + ASSERT_EQ(size, 0); - ASSERT_EQ("v1", Get(0, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); + start = Key(500); + end = Key(600); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + } - // Flush can still go through. - ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); + start = Key(500); + end = Key(600); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); -} + start = Key(100); + end = Key(1020); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_GT(size, 6000); -TEST_F(DBTest, GetProperty) { - // Set sizes to both background thread pool to be 1 and block them. 
- env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - test::SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_high, Env::Priority::HIGH); - - Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.compaction_style = kCompactionStyleUniversal; - options.level0_file_num_compaction_trigger = 1; - options.compaction_options_universal.size_ratio = 50; - options.max_background_compactions = 1; - options.max_background_flushes = 1; - options.max_write_buffer_number = 10; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; - options.write_buffer_size = 1000000; - Reopen(options); + options.max_write_buffer_number = 8; + options.min_write_buffer_number_to_merge = 5; + options.write_buffer_size = 1024 * N; // Not very large + DestroyAndReopen(options); - std::string big_value(1000000 * 2, 'x'); - std::string num; - uint64_t int_num; - SetPerfLevel(kEnableTime); + int keys[N * 3]; + for (int i = 0; i < N; i++) { + keys[i * 3] = i * 5; + keys[i * 3 + 1] = i * 5 + 1; + keys[i * 3 + 2] = i * 5 + 2; + } + std::random_shuffle(std::begin(keys), std::end(keys)); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num)); - ASSERT_EQ(int_num, 0U); - - ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); 
- ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); - ASSERT_EQ(num, "1"); - perf_context.Reset(); - - ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing")); - ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "2"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); - ASSERT_EQ(num, "2"); - // Verify the same set of properties through GetIntProperty - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num)); - ASSERT_EQ(int_num, 2U); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num)); - ASSERT_EQ(int_num, 1U); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num)); - ASSERT_EQ(int_num, 0U); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); - ASSERT_EQ(int_num, 2U); - - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); + for (int i = 0; i < N * 3; i++) { + ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024))); + } - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); + start = Key(100); + end = Key(300); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); - ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); - ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value)); - dbfull()->TEST_WaitForFlushMemTable(); - 
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); - ASSERT_EQ(num, "4"); + start = Key(1050); + end = Key(1080); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_GT(size, 6000); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_GT(int_num, 0U); + start = Key(2100); + end = Key(2300); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size, true); + ASSERT_EQ(size, 0); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + start = Key(1050); + end = Key(1080); + r = Range(start, end); + uint64_t size_with_mt, size_without_mt; + db_->GetApproximateSizes(&r, 1, &size_with_mt, true); + ASSERT_GT(size_with_mt, 6000); + db_->GetApproximateSizes(&r, 1, &size_without_mt, false); + ASSERT_EQ(size_without_mt, 0); - // Wait for compaction to be done. This is important because otherwise RocksDB - // might schedule a compaction when reopening the database, failing assertion - // (A) as a result. - dbfull()->TEST_WaitForCompact(); - options.max_open_files = 10; - Reopen(options); - // After reopening, no table reader is loaded, so no memory for table readers - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); // (A) - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); - ASSERT_GT(int_num, 0U); - - // After reading a key, at least one table reader is loaded. 
- Get("k5"); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_GT(int_num, 0U); - - // Test rocksdb.num-live-versions - { - options.level0_file_num_compaction_trigger = 20; - Reopen(options); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 1U); + Flush(); - // Use an iterator to hold current version - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024))); + } - ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value)); - Flush(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 2U); + start = Key(1050); + end = Key(1080); + r = Range(start, end); + db_->GetApproximateSizes(&r, 1, &size_with_mt, true); + db_->GetApproximateSizes(&r, 1, &size_without_mt, false); + ASSERT_GT(size_with_mt, size_without_mt); + ASSERT_GT(size_without_mt, 6000); +} - // Use an iterator to hold current version - std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); +TEST_F(DBTest, ApproximateSizes) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value)); - Flush(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 3U); + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); - iter2.reset(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 2U); + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + const int N = 80; + static const int S1 = 
100000; + static const int S2 = 105000; // Allow some expansion from metadata + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1))); + } - iter1.reset(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 1U); - } -} + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); -TEST_F(DBTest, ApproximateMemoryUsage) { - const int kNumRounds = 10; - // TODO(noetzli) kFlushesPerRound does not really correlate with how many - // flushes happen. - const int kFlushesPerRound = 10; - const int kWritesPerFlush = 10; - const int kKeySize = 100; - const int kValueSize = 1000; - Options options; - options.write_buffer_size = 1000; // small write buffer - options.min_write_buffer_number_to_merge = 4; - options.compression = kNoCompression; - options.create_if_missing = true; - options = CurrentOptions(options); - DestroyAndReopen(options); + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + ReopenWithColumnFamilies({"default", "pikachu"}, options); - Random rnd(301); + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); + ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), + S2 * (i + 1))); + ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); + } + ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); + ASSERT_TRUE( + Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); - std::vector iters; - - uint64_t active_mem; - uint64_t unflushed_mem; - uint64_t all_mem; - uint64_t prev_all_mem; - - // Phase 0. The verify the initial value of all these properties are the same - // as we have no mem-tables. 
- dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_EQ(all_mem, active_mem); - ASSERT_EQ(all_mem, unflushed_mem); - - // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" equals to - // "size-all-mem-tables" - for (int r = 0; r < kNumRounds; ++r) { - for (int f = 0; f < kFlushesPerRound; ++f) { - for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); - } - } - // Make sure that there is no flush between getting the two properties. - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - // in no iterator case, these two number should be the same. - ASSERT_EQ(unflushed_mem, all_mem); - } - prev_all_mem = all_mem; - - // Phase 2. Keep issuing Put() but also create new iterators. This time we - // expect "size-all-mem-tables" > "cur-size-all-mem-tables". - for (int r = 0; r < kNumRounds; ++r) { - iters.push_back(db_->NewIterator(ReadOptions())); - for (int f = 0; f < kFlushesPerRound; ++f) { - for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); } - } - // Force flush to prevent flush from happening between getting the - // properties or after getting the properties and before the new round. - Flush(); - // In the second round, add iterators. 
- dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_GT(all_mem, active_mem); - ASSERT_GT(all_mem, unflushed_mem); - ASSERT_GT(all_mem, prev_all_mem); - prev_all_mem = all_mem; - } - - // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks - // whenever we release an iterator. - for (auto* iter : iters) { - delete iter; - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - // Expect the size shrinking - ASSERT_LT(all_mem, prev_all_mem); - prev_all_mem = all_mem; - } - - // Expect all these three counters to be the same. - dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_EQ(active_mem, unflushed_mem); - ASSERT_EQ(unflushed_mem, all_mem); - - // Phase 5. Reopen, and expect all these three counters to be the same again. - Reopen(options); - dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_EQ(active_mem, unflushed_mem); - ASSERT_EQ(unflushed_mem, all_mem); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable | kSkipHashIndex)); } -TEST_F(DBTest, EstimatePendingCompBytes) { - // Set sizes to both background thread pool to be 1 and block them. 
- env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - - Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.compaction_style = kCompactionStyleLevel; - options.level0_file_num_compaction_trigger = 2; - options.max_background_compactions = 1; - options.max_background_flushes = 1; - options.max_write_buffer_number = 10; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; - options.write_buffer_size = 1000000; - Reopen(options); - - std::string big_value(1000000 * 2, 'x'); - std::string num; - uint64_t int_num; +TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); - Flush(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_EQ(int_num, 0U); + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(2), big1)); + ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(4), big1)); + ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); - ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); - Flush(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_EQ(int_num, 0U); + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + ReopenWithColumnFamilies({"default", 
"pikachu"}, options); - ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); - Flush(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_GT(int_num, 0U); + ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); - dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_EQ(int_num, 0U); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. 
+ } while (ChangeOptions(kSkipPlainTable)); } +#endif // ROCKSDB_LITE -TEST_F(DBTest, FLUSH) { +#ifndef ROCKSDB_LITE +TEST_F(DBTest, Snapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - SetPerfLevel(kEnableTime);; - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); - // this will now also flush the last 2 writes - ASSERT_OK(Flush(1)); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + Put(0, "foo", "0v1"); + Put(1, "foo", "1v1"); - perf_context.Reset(); - Get(1, "foo"); - ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_EQ(1U, GetNumSnapshots()); + uint64_t time_snap1 = GetTimeOldestSnapshots(); + ASSERT_GT(time_snap1, 0U); + Put(0, "foo", "0v2"); + Put(1, "foo", "1v2"); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); + env_->addon_time_.fetch_add(1); - writeOpt.disableWAL = true; - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); - ASSERT_OK(Flush(1)); + const Snapshot* s2 = db_->GetSnapshot(); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + Put(0, "foo", "0v3"); + Put(1, "foo", "1v3"); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v2", Get(1, "bar")); - perf_context.Reset(); - ASSERT_EQ("v2", Get(1, "foo")); - ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + { + ManagedSnapshot s3(db_); + ASSERT_EQ(3U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); - writeOpt.disableWAL = false; - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", 
"v3")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); - ASSERT_OK(Flush(1)); + Put(0, "foo", "0v4"); + Put(1, "foo", "1v4"); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot())); + ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot())); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - // 'foo' should be there because its put - // has WAL enabled. - ASSERT_EQ("v3", Get(1, "foo")); - ASSERT_EQ("v3", Get(1, "bar")); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); - SetPerfLevel(kDisable); - } while (ChangeCompactOptions()); -} + db_->ReleaseSnapshot(s1); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + ASSERT_EQ(1U, GetNumSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); -TEST_F(DBTest, RecoveryWithEmptyLog) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_OK(Put(1, "foo", "v2")); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - ASSERT_EQ("v3", Get(1, "foo")); - } while (ChangeOptions()); + db_->ReleaseSnapshot(s2); + ASSERT_EQ(0U, GetNumSnapshots()); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } while (ChangeOptions(kSkipHashCuckoo)); } +TEST_F(DBTest, 
HiddenValuesAreRemoved) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + FillLevels("a", "z", 1); -TEST_F(DBTest, FlushSchedule) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.level0_stop_writes_trigger = 1 << 10; - options.level0_slowdown_writes_trigger = 1 << 10; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; - options.max_write_buffer_number = 2; - options.write_buffer_size = 120 * 1024; - CreateAndReopenWithCF({"pikachu"}, options); - std::vector threads; - - std::atomic thread_num(0); - // each column family will have 5 thread, each thread generating 2 memtables. - // each column family should end up with 10 table files - std::function fill_memtable_func = [&]() { - int a = thread_num.fetch_add(1); - Random rnd(a); - WriteOptions wo; - // this should fill up 2 memtables - for (int k = 0; k < 5000; ++k) { - ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); - } - }; + std::string big = RandomString(&rnd, 50000); + Put(1, "foo", big); + Put(1, "pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put(1, "foo", "tiny"); + Put(1, "pastfoo2", "v2"); // Advance sequence number one more - for (int i = 0; i < 10; ++i) { - threads.emplace_back(fill_memtable_func); - } + ASSERT_OK(Flush(1)); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); - for (auto& t : threads) { - t.join(); - } + ASSERT_EQ(big, Get(1, "foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); + Slice x("x"); + dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + 
ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); + dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); - auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); - ASSERT_LE(default_tables, static_cast(10)); - ASSERT_GT(default_tables, static_cast(0)); - ASSERT_LE(pikachu_tables, static_cast(10)); - ASSERT_GT(pikachu_tables, static_cast(0)); + ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). + // skip HashCuckooRep as it does not support snapshot + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable | kSkipHashCuckoo)); } +#endif // ROCKSDB_LITE - -TEST_F(DBTest, ManifestRollOver) { +TEST_F(DBTest, CompactBetweenSnapshots) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - Options options; - options.max_manifest_file_size = 10 ; // 10 bytes - options = CurrentOptions(options); + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - { - ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); - ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); - ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); - uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); - ASSERT_OK(Flush(1)); // This should trigger LogAndApply. - uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); - ASSERT_GT(manifest_after_flush, manifest_before_flush); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); - // check if a new manifest file got inserted or not. 
- ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); - ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2")); - ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3")); - } - } while (ChangeCompactOptions()); -} + Random rnd(301); + FillLevels("a", "z", 1); -TEST_F(DBTest, IdentityAcrossRestarts) { - do { - std::string id1; - ASSERT_OK(db_->GetDbIdentity(id1)); + Put(1, "foo", "first"); + const Snapshot* snapshot1 = db_->GetSnapshot(); + Put(1, "foo", "second"); + Put(1, "foo", "third"); + Put(1, "foo", "fourth"); + const Snapshot* snapshot2 = db_->GetSnapshot(); + Put(1, "foo", "fifth"); + Put(1, "foo", "sixth"); - Options options = CurrentOptions(); - Reopen(options); - std::string id2; - ASSERT_OK(db_->GetDbIdentity(id2)); - // id1 should match id2 because identity was not regenerated - ASSERT_EQ(id1.compare(id2), 0); + // All entries (including duplicates) exist + // before any compaction or flush is triggered. + ASSERT_EQ(AllEntriesFor("foo", 1), + "[ sixth, fifth, fourth, third, second, first ]"); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ("first", Get(1, "foo", snapshot1)); - std::string idfilename = IdentityFileName(dbname_); - ASSERT_OK(env_->DeleteFile(idfilename)); - Reopen(options); - std::string id3; - ASSERT_OK(db_->GetDbIdentity(id3)); - // id1 should NOT match id3 because identity was regenerated - ASSERT_NE(id1.compare(id3), 0); - } while (ChangeCompactOptions()); -} + // After a flush, "second", "third" and "fifth" should + // be removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); -TEST_F(DBTest, RecoverWithLargeLog) { - do { - { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(Put(1, "big1", std::string(200000, '1'))); - ASSERT_OK(Put(1, "big2", std::string(200000, '2'))); - ASSERT_OK(Put(1, "small3", std::string(10, '3'))); - ASSERT_OK(Put(1, "small4", std::string(10, '4'))); - 
ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - } + // after we release the snapshot1, only two values left + db_->ReleaseSnapshot(snapshot1); + FillLevels("a", "z", 1); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); - // Make sure that if we re-open with a small write buffer size that - // we flush table files in the middle of a large log file. - Options options; - options.write_buffer_size = 100000; - options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); - ASSERT_EQ(std::string(200000, '1'), Get(1, "big1")); - ASSERT_EQ(std::string(200000, '2'), Get(1, "big2")); - ASSERT_EQ(std::string(10, '3'), Get(1, "small3")); - ASSERT_EQ(std::string(10, '4'), Get(1, "small4")); - ASSERT_GT(NumTableFilesAtLevel(0, 1), 1); - } while (ChangeCompactOptions()); + // We have only one valid snapshot snapshot2. Since snapshot1 is + // not valid anymore, "first" should be removed by a compaction. 
+ ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); + + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z", 1); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); + // skip HashCuckooRep as it does not support snapshot + } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction)); } -namespace { -class KeepFilter : public CompactionFilter { - public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { - return false; - } +TEST_F(DBTest, UnremovableSingleDelete) { + // If we compact: + // + // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2) + // + // We do not want to end up with: + // + // Put(A, v1) Snapshot Put(A, v2) + // + // Because a subsequent SingleDelete(A) would delete the Put(A, v2) + // but not Put(A, v1), so Get(A) would return v1. 
+ anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); - virtual const char* Name() const override { return "KeepFilter"; } -}; + Put(1, "foo", "first"); + const Snapshot* snapshot = db_->GetSnapshot(); + SingleDelete(1, "foo"); + Put(1, "foo", "second"); + ASSERT_OK(Flush(1)); -class KeepFilterFactory : public CompactionFilterFactory { - public: - explicit KeepFilterFactory(bool check_context = false) - : check_context_(check_context) {} + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("second", Get(1, "foo")); - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { - if (check_context_) { - EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); - EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); - } - return std::unique_ptr(new KeepFilter()); - } + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); - virtual const char* Name() const override { return "KeepFilterFactory"; } - bool check_context_; - std::atomic_bool expect_full_compaction_; - std::atomic_bool expect_manual_compaction_; -}; + SingleDelete(1, "foo"); -class DelayFilter : public CompactionFilter { - public: - explicit DelayFilter(DBTestBase* d) : db_test(d) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { - db_test->env_->addon_time_.fetch_add(1000); - return true; - } + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - virtual const char* Name() const override { return "DelayFilter"; } + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); - private: - DBTestBase* db_test; 
-}; + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + db_->ReleaseSnapshot(snapshot); + // Skip HashCuckooRep as it does not support single delete. FIFO and + // universal compaction do not apply to the test case. Skip MergePut + // because single delete does not get removed when it encounters a merge. + } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | + kSkipUniversalCompaction | kSkipMergePut)); +} -class DelayFilterFactory : public CompactionFilterFactory { - public: - explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { - return std::unique_ptr(new DelayFilter(db_test)); - } +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DeletionMarkers1) { + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - virtual const char* Name() const override { return "DelayFilterFactory"; } + // Place a table at level last-1 to prevent merging with preceding mutation + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - private: - DBTestBase* db_test; -}; -} // namespace + Delete(1, "foo"); + Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + Slice z("z"); + dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). 
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); +} -TEST_F(DBTest, CompressedCache) { - if (!Snappy_Supported()) { - return; - } - int num_iter = 80; +TEST_F(DBTest, DeletionMarkers2) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - // Run this test three iterations. - // Iteration 1: only a uncompressed block cache - // Iteration 2: only a compressed block cache - // Iteration 3: both block cache and compressed cache - // Iteration 4: both block cache and compressed cache, but DB is not - // compressed - for (int iter = 0; iter < 4; iter++) { - Options options; - options.write_buffer_size = 64*1024; // small write buffer - options.statistics = rocksdb::CreateDBStatistics(); - options = CurrentOptions(options); + // Place a table at level last-1 to prevent merging with preceding mutation + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - BlockBasedTableOptions table_options; - switch (iter) { - case 0: - // only uncompressed block cache - table_options.block_cache = NewLRUCache(8*1024); - table_options.block_cache_compressed = nullptr; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 1: - // no block cache, only compressed cache - table_options.no_block_cache = true; - table_options.block_cache = nullptr; - table_options.block_cache_compressed = NewLRUCache(8*1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - 
case 2: - // both compressed and uncompressed block cache - table_options.block_cache = NewLRUCache(1024); - table_options.block_cache_compressed = NewLRUCache(8*1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 3: - // both block cache and compressed cache, but DB is not compressed - // also, make block cache sizes bigger, to trigger block cache hits - table_options.block_cache = NewLRUCache(1024 * 1024); - table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.compression = kNoCompression; - break; - default: - ASSERT_TRUE(false); - } + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); + // DEL kept: "last" file overlaps + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); +} + +TEST_F(DBTest, OverlapInLevel0) { + do { + Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - // default column family doesn't have block cache - Options no_block_cache_opts; - no_block_cache_opts.statistics = options.statistics; - no_block_cache_opts = CurrentOptions(no_block_cache_opts); - BlockBasedTableOptions table_options_no_bc; - table_options_no_bc.no_block_cache = true; - no_block_cache_opts.table_factory.reset( - NewBlockBasedTableFactory(table_options_no_bc)); - ReopenWithColumnFamilies({"default", "pikachu"}, - std::vector({no_block_cache_opts, options})); - Random rnd(301); + // Fill levels 1 and 2 to disable the pushing of new memtables to levels > + // 0. 
+ ASSERT_OK(Put(1, "100", "v100")); + ASSERT_OK(Put(1, "999", "v999")); + Flush(1); + MoveFilesToLevel(2, 1); + ASSERT_OK(Delete(1, "100")); + ASSERT_OK(Delete(1, "999")); + Flush(1); + MoveFilesToLevel(1, 1); + ASSERT_EQ("0,1,1", FilesPerLevel(1)); - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - std::vector values; - std::string str; - for (int i = 0; i < num_iter; i++) { - if (i % 4 == 0) { // high compression ratio - str = RandomString(&rnd, 1000); - } - values.push_back(str); - ASSERT_OK(Put(1, Key(i), values[i])); - } + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. + ASSERT_OK(Put(1, "300", "v300")); + ASSERT_OK(Put(1, "500", "v500")); + Flush(1); + ASSERT_OK(Put(1, "200", "v200")); + ASSERT_OK(Put(1, "600", "v600")); + ASSERT_OK(Put(1, "900", "v900")); + Flush(1); + ASSERT_EQ("2,1,1", FilesPerLevel(1)); - // flush all data from memtable so that reads are from block cache - ASSERT_OK(Flush(1)); + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); + ASSERT_EQ("2", FilesPerLevel(1)); + + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. 
+ ASSERT_OK(Delete(1, "600")); + Flush(1); + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ("NOT_FOUND", Get(1, "600")); + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); +} +#endif // ROCKSDB_LITE - for (int i = 0; i < num_iter; i++) { - ASSERT_EQ(Get(1, Key(i)), values[i]); +TEST_F(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + virtual const char* Name() const override { + return "rocksdb.NewComparator"; + } + virtual int Compare(const Slice& a, const Slice& b) const override { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, + const Slice& l) const override { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const override { + BytewiseComparator()->FindShortSuccessor(key); } + }; + Options new_options, options; + NewComparator cmp; + do { + options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + new_options = CurrentOptions(); + new_options.comparator = &cmp; + // only the non-default column family has non-matching comparator + Status s = TryReopenWithColumnFamilies( + {"default", "pikachu"}, std::vector({options, new_options})); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); + } while (ChangeCompactOptions()); +} - // check that we triggered the appropriate code paths in the cache - switch (iter) { - case 0: - // only uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 1: - // no block cache, only compressed cache - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 2: - // both compressed and uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - 
ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 3: - // both compressed and uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - // compressed doesn't have any hits since blocks are not compressed on - // storage - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0); - break; - default: - ASSERT_TRUE(false); +TEST_F(DBTest, CustomComparator) { + class NumberComparator : public Comparator { + public: + virtual const char* Name() const override { + return "test.NumberComparator"; + } + virtual int Compare(const Slice& a, const Slice& b) const override { + return ToNumber(a) - ToNumber(b); + } + virtual void FindShortestSeparator(std::string* s, + const Slice& l) const override { + ToNumber(*s); // Check format + ToNumber(l); // Check format + } + virtual void FindShortSuccessor(std::string* key) const override { + ToNumber(*key); // Check format } - options.create_if_missing = true; - DestroyAndReopen(options); - } -} + private: + static int ToNumber(const Slice& x) { + // Check that there are no extra characters. 
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']') + << EscapeString(x); + int val; + char ignored; + EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) + << EscapeString(x); + return val; + } + }; + Options new_options; + NumberComparator cmp; + do { + new_options = CurrentOptions(); + new_options.create_if_missing = true; + new_options.comparator = &cmp; + new_options.write_buffer_size = 4096; // Compact more often + new_options.arena_block_size = 4096; + new_options = CurrentOptions(new_options); + DestroyAndReopen(new_options); + CreateAndReopenWithCF({"pikachu"}, new_options); + ASSERT_OK(Put(1, "[10]", "ten")); + ASSERT_OK(Put(1, "[0x14]", "twenty")); + for (int i = 0; i < 2; i++) { + ASSERT_EQ("ten", Get(1, "[10]")); + ASSERT_EQ("ten", Get(1, "[0xa]")); + ASSERT_EQ("twenty", Get(1, "[20]")); + ASSERT_EQ("twenty", Get(1, "[0x14]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[15]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]")); + Compact(1, "[0]", "[9999]"); + } -static std::string CompressibleString(Random* rnd, int len) { - std::string r; - test::CompressibleString(rnd, 0.8, len, &r); - return r; + for (int run = 0; run < 2; run++) { + for (int i = 0; i < 1000; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "[%d]", i * 10); + ASSERT_OK(Put(1, buf, buf)); + } + Compact(1, "[0]", "[1000000]"); + } + } while (ChangeCompactOptions()); } -TEST_F(DBTest, FailMoreDbPaths) { +TEST_F(DBTest, DBOpen_Options) { Options options = CurrentOptions(); - options.db_paths.emplace_back(dbname_, 10000000); - options.db_paths.emplace_back(dbname_ + "_2", 1000000); - options.db_paths.emplace_back(dbname_ + "_3", 1000000); - options.db_paths.emplace_back(dbname_ + "_4", 1000000); - options.db_paths.emplace_back(dbname_ + "_5", 1000000); - ASSERT_TRUE(TryReopen(options).IsNotSupported()); -} + std::string dbname = test::TmpDir(env_) + "/db_options_test"; + ASSERT_OK(DestroyDB(dbname, options)); -void CheckColumnFamilyMeta(const 
ColumnFamilyMetaData& cf_meta) { - uint64_t cf_size = 0; - uint64_t cf_csize = 0; - size_t file_count = 0; - for (auto level_meta : cf_meta.levels) { - uint64_t level_size = 0; - uint64_t level_csize = 0; - file_count += level_meta.files.size(); - for (auto file_meta : level_meta.files) { - level_size += file_meta.size; - } - ASSERT_EQ(level_meta.size, level_size); - cf_size += level_size; - cf_csize += level_csize; - } - ASSERT_EQ(cf_meta.file_count, file_count); - ASSERT_EQ(cf_meta.size, cf_size); -} + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + options.create_if_missing = false; + Status s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + ASSERT_TRUE(db == nullptr); -TEST_F(DBTest, ColumnFamilyMetaDataTest) { - Options options = CurrentOptions(); + // Does not exist, and create_if_missing == true: OK options.create_if_missing = true; - DestroyAndReopen(options); - - Random rnd(301); - int key_index = 0; - ColumnFamilyMetaData cf_meta; - for (int i = 0; i < 100; ++i) { - GenerateNewFile(&rnd, &key_index); - db_->GetColumnFamilyMetaData(&cf_meta); - CheckColumnFamilyMeta(cf_meta); - } -} + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); -namespace { -void MinLevelHelper(DBTest* self, Options& options) { - Random rnd(301); + delete db; + db = nullptr; - for (int num = 0; - num < options.level0_file_num_compaction_trigger - 1; - num++) - { - std::vector values; - // Write 120KB (12 values, each 10K) - for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); - ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); - } - self->dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); - } + // Does exist, and error_if_exists == true: error + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname, &db); + 
ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); - //generate one more file in level-0, and should trigger level-0 compaction - std::vector values; - for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); - ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); - } - self->dbfull()->TEST_WaitForCompact(); + // Does exist, and error_if_exists == false: OK + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); - ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); - ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); + delete db; + db = nullptr; } -// returns false if the calling-Test should be skipped -bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, - int lev, int strategy) { - fprintf(stderr, "Test with compression options : window_bits = %d, level = %d, strategy = %d}\n", wbits, lev, strategy); - options.write_buffer_size = 100<<10; //100KB - options.arena_block_size = 4096; - options.num_levels = 3; - options.level0_file_num_compaction_trigger = 3; +TEST_F(DBTest, DBOpen_Change_NumLevels) { + Options options = CurrentOptions(); options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_TRUE(db_ != nullptr); + CreateAndReopenWithCF({"pikachu"}, options); - if (Snappy_Supported()) { - type = kSnappyCompression; - fprintf(stderr, "using snappy\n"); - } else if (Zlib_Supported()) { - type = kZlibCompression; - fprintf(stderr, "using zlib\n"); - } else if (BZip2_Supported()) { - type = kBZip2Compression; - fprintf(stderr, "using bzip2\n"); - } else if (LZ4_Supported()) { - type = kLZ4Compression; - fprintf(stderr, "using lz4\n"); - } else { - fprintf(stderr, "skipping test, compression disabled\n"); - return false; - } - options.compression_per_level.resize(options.num_levels); + ASSERT_OK(Put(1, "a", "123")); + ASSERT_OK(Put(1, "b", "234")); + Flush(1); + 
MoveFilesToLevel(3, 1); + Close(); - // do not compress L0 - for (int i = 0; i < 1; i++) { - options.compression_per_level[i] = kNoCompression; - } - for (int i = 1; i < options.num_levels; i++) { - options.compression_per_level[i] = type; - } - return true; + options.create_if_missing = false; + options.num_levels = 2; + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); + ASSERT_TRUE(db_ == nullptr); } -} // namespace - -TEST_F(DBTest, MinLevelToCompress1) { - Options options = CurrentOptions(); - CompressionType type = kSnappyCompression; - if (!MinLevelToCompress(type, options, -14, -1, 0)) { - return; - } - Reopen(options); - MinLevelHelper(this, options); - // do not compress L0 and L1 - for (int i = 0; i < 2; i++) { - options.compression_per_level[i] = kNoCompression; - } - for (int i = 2; i < options.num_levels; i++) { - options.compression_per_level[i] = type; - } - DestroyAndReopen(options); - MinLevelHelper(this, options); -} +TEST_F(DBTest, DestroyDBMetaDatabase) { + std::string dbname = test::TmpDir(env_) + "/db_meta"; + ASSERT_OK(env_->CreateDirIfMissing(dbname)); + std::string metadbname = MetaDatabaseName(dbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metadbname)); + std::string metametadbname = MetaDatabaseName(metadbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metametadbname)); -TEST_F(DBTest, MinLevelToCompress2) { + // Destroy previous versions if they exist. Using the long way. 
Options options = CurrentOptions(); - CompressionType type = kSnappyCompression; - if (!MinLevelToCompress(type, options, 15, -1, 0)) { - return; - } - Reopen(options); - MinLevelHelper(this, options); - - // do not compress L0 and L1 - for (int i = 0; i < 2; i++) { - options.compression_per_level[i] = kNoCompression; - } - for (int i = 2; i < options.num_levels; i++) { - options.compression_per_level[i] = type; - } - DestroyAndReopen(options); - MinLevelHelper(this, options); -} + ASSERT_OK(DestroyDB(metametadbname, options)); + ASSERT_OK(DestroyDB(metadbname, options)); + ASSERT_OK(DestroyDB(dbname, options)); -TEST_F(DBTest, RepeatedWritesToSameKey) { - do { - Options options; - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, options); + // Setup databases + DB* db = nullptr; + ASSERT_OK(DB::Open(options, dbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(options, metadbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(options, metametadbname, &db)); + delete db; + db = nullptr; - // We must have at most one file per level except for level-0, - // which may have up to kL0_StopWritesTrigger files. - const int kMaxFiles = - options.num_levels + options.level0_stop_writes_trigger; + // Delete databases + ASSERT_OK(DestroyDB(dbname, options)); - Random rnd(301); - std::string value = - RandomString(&rnd, static_cast(2 * options.write_buffer_size)); - for (int i = 0; i < 5 * kMaxFiles; i++) { - ASSERT_OK(Put(1, "key", value)); - ASSERT_LE(TotalTableFiles(1), kMaxFiles); - } - } while (ChangeCompactOptions()); + // Check if deletion worked. 
+ options.create_if_missing = false; + ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); } -TEST_F(DBTest, SparseMerge) { +#ifndef ROCKSDB_LITE +TEST_F(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); - options.compression = kNoCompression; + options.write_buffer_size = 100000000; // Large write buffer CreateAndReopenWithCF({"pikachu"}, options); - FillLevels("A", "Z", 1); + Random rnd(301); - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. - const std::string value(1000, 'x'); - Put(1, "A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put((i < 40), Key(i), values[i])); } - Put(1, "C", "vc"); - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - // Make sparse update - Put(1, "A", "va2"); - Put(1, "B100", "bvalue2"); - Put(1, "C", "vc2"); - ASSERT_OK(Flush(1)); + // assert that nothing makes it to disk yet. + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. 
- ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - } while (ChangeCompactOptions()); -} + // get a file snapshot + uint64_t manifest_number = 0; + uint64_t manifest_size = 0; + std::vector files; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(files, &manifest_size); -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -TEST_F(DBTest, ApproximateSizesMemTable) { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - options.create_if_missing = true; - options = CurrentOptions(options); - DestroyAndReopen(options); - - const int N = 128; - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - } - - uint64_t size; - std::string start = Key(50); - std::string end = Key(60); - Range r(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_GT(size, 6000); - ASSERT_LT(size, 204800); - // Zero if not including mem table - db_->GetApproximateSizes(&r, 1, &size, false); - ASSERT_EQ(size, 0); - - start = Key(500); - end = Key(600); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); - - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); - } + // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF) + ASSERT_EQ(files.size(), 5U); - start = Key(500); - end = Key(600); - r = Range(start, 
end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); - - start = Key(100); - end = Key(1020); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_GT(size, 6000); + uint64_t number = 0; + FileType type; - options.max_write_buffer_number = 8; - options.min_write_buffer_number_to_merge = 5; - options.write_buffer_size = 1024 * N; // Not very large - DestroyAndReopen(options); + // copy these files to a new snapshot directory + std::string snapdir = dbname_ + ".snapdir/"; + ASSERT_OK(env_->CreateDirIfMissing(snapdir)); - int keys[N * 3]; - for (int i = 0; i < N; i++) { - keys[i * 3] = i * 5; - keys[i * 3 + 1] = i * 5 + 1; - keys[i * 3 + 2] = i * 5 + 2; - } - std::random_shuffle(std::begin(keys), std::end(keys)); + for (size_t i = 0; i < files.size(); i++) { + // our clients require that GetLiveFiles returns + // files with "/" as first character! + ASSERT_EQ(files[i][0], '/'); + std::string src = dbname_ + files[i]; + std::string dest = snapdir + files[i]; - for (int i = 0; i < N * 3; i++) { - ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024))); - } + uint64_t size; + ASSERT_OK(env_->GetFileSize(src, &size)); - start = Key(100); - end = Key(300); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); + // record the number and the size of the + // latest manifest file + if (ParseFileName(files[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > manifest_number) { + manifest_number = number; + ASSERT_GE(size, manifest_size); + size = manifest_size; // copy only valid MANIFEST data + } + } + } + CopyFile(src, dest, size); + } - start = Key(1050); - end = Key(1080); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_GT(size, 6000); + // release file snapshot + dbfull()->DisableFileDeletions(); + // overwrite one key, this key should not appear in the snapshot + std::vector extras; + for (unsigned int i = 0; i < 
1; i++) { + extras.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(0, Key(i), extras[i])); + } - start = Key(2100); - end = Key(2300); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, true); - ASSERT_EQ(size, 0); + // verify that data in the snapshot are correct + std::vector column_families; + column_families.emplace_back("default", ColumnFamilyOptions()); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); + std::vector cf_handles; + DB* snapdb; + DBOptions opts; + opts.env = env_; + opts.create_if_missing = false; + Status stat = + DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); + ASSERT_OK(stat); - start = Key(1050); - end = Key(1080); - r = Range(start, end); - uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(&r, 1, &size_with_mt, true); - ASSERT_GT(size_with_mt, 6000); - db_->GetApproximateSizes(&r, 1, &size_without_mt, false); - ASSERT_EQ(size_without_mt, 0); + ReadOptions roptions; + std::string val; + for (unsigned int i = 0; i < 80; i++) { + stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); + ASSERT_EQ(values[i].compare(val), 0); + } + for (auto cfh : cf_handles) { + delete cfh; + } + delete snapdb; - Flush(); + // look at the new live files after we added an 'extra' key + // and after we took the first snapshot. + uint64_t new_manifest_number = 0; + uint64_t new_manifest_size = 0; + std::vector newfiles; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(newfiles, &new_manifest_size); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024))); - } + // find the new manifest file. assert that this manifest file is + // the same one as in the previous snapshot. But its size should be + // larger because we added an extra key after taking the + // previous shapshot. 
+ for (size_t i = 0; i < newfiles.size(); i++) { + std::string src = dbname_ + "/" + newfiles[i]; + // record the lognumber and the size of the + // latest manifest file + if (ParseFileName(newfiles[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > new_manifest_number) { + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); + } + } + } + } + ASSERT_EQ(manifest_number, new_manifest_number); + ASSERT_GT(new_manifest_size, manifest_size); - start = Key(1050); - end = Key(1080); - r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size_with_mt, true); - db_->GetApproximateSizes(&r, 1, &size_without_mt, false); - ASSERT_GT(size_with_mt, size_without_mt); - ASSERT_GT(size_without_mt, 6000); + // release file snapshot + dbfull()->DisableFileDeletions(); + } while (ChangeCompactOptions()); } +#endif -TEST_F(DBTest, ApproximateSizes) { +TEST_F(DBTest, CompactOnFlush) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - Options options; - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - options.create_if_missing = true; - options = CurrentOptions(options); - DestroyAndReopen(options); + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - const int N = 80; - static const int S1 = 100000; - static const int S2 = 105000; // Allow some expansion from metadata - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(1, Key(i), 
RandomString(&rnd, S1))); - } + // Write two new keys + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); - // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); + // Case1: Delete followed by a put + Delete(1, "foo"); + Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, options); + // After the current memtable is flushed, the DEL should + // have been removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); - ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), - S2 * (i + 1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); - } - ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); - ASSERT_TRUE( - Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); - std::string cstart_str = Key(compact_start); - std::string cend_str = Key(compact_start + 9); - Slice cstart = cstart_str; - Slice cend = cend_str; - dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); - } + // Case 2: Delete followed by another delete + Delete(1, "foo"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); - } - // ApproximateOffsetOf() is not yet implemented 
in plain table format. - } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | - kSkipPlainTable | kSkipHashIndex)); -} - -TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { - do { - Options options = CurrentOptions(); - options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, options); + // Case 3: Put followed by a delete + Put(1, "foo", "v3"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(2), big1)); - ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(4), big1)); - ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); + // Case 4: Put followed by another Put + Put(1, "foo", "v4"); + Put(1, "foo", "v5"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, options); + // clear database + Delete(1, "foo"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); - ASSERT_TRUE(Between(Size("", 
Key(3), 1), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); + // Case 5: Put followed by snapshot followed by another Put + // Both puts should remain. + Put(1, "foo", "v6"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put(1, "foo", "v7"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); + db_->ReleaseSnapshot(snapshot); - ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); + // clear database + Delete(1, "foo"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - } - // ApproximateOffsetOf() is not yet implemented in plain table format. - } while (ChangeOptions(kSkipPlainTable)); + // Case 5: snapshot followed by a put followed by another Put + // Only the last put should remain. + const Snapshot* snapshot1 = db_->GetSnapshot(); + Put(1, "foo", "v8"); + Put(1, "foo", "v9"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); + db_->ReleaseSnapshot(snapshot1); + } while (ChangeCompactOptions()); } -TEST_F(DBTest, IteratorPinsRef) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "hello"); - - // Get iterator that will yield the current contents of the DB. 
- Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); +TEST_F(DBTest, FlushOneColumnFamily) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); - // Write to force compactions - Put(1, "foo", "newvalue1"); - for (int i = 0; i < 100; i++) { - // 100K values - ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); - } - Put(1, "foo", "newvalue2"); + ASSERT_OK(Put(0, "Default", "Default")); + ASSERT_OK(Put(1, "pikachu", "pikachu")); + ASSERT_OK(Put(2, "ilya", "ilya")); + ASSERT_OK(Put(3, "muromec", "muromec")); + ASSERT_OK(Put(4, "dobrynia", "dobrynia")); + ASSERT_OK(Put(5, "nikitich", "nikitich")); + ASSERT_OK(Put(6, "alyosha", "alyosha")); + ASSERT_OK(Put(7, "popovich", "popovich")); - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("hello", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - delete iter; - } while (ChangeCompactOptions()); + for (int i = 0; i < 8; ++i) { + Flush(i); + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), i + 1U); + } } -TEST_F(DBTest, Snapshot) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - Put(0, "foo", "0v1"); - Put(1, "foo", "1v1"); - - const Snapshot* s1 = db_->GetSnapshot(); - ASSERT_EQ(1U, GetNumSnapshots()); - uint64_t time_snap1 = GetTimeOldestSnapshots(); - ASSERT_GT(time_snap1, 0U); - Put(0, "foo", "0v2"); - Put(1, "foo", "1v2"); - - env_->addon_time_.fetch_add(1); +TEST_F(DBTest, PurgeInfoLogs) { + Options options = CurrentOptions(); + options.keep_log_file_num = 5; + options.create_if_missing = true; + for (int mode = 0; mode <= 1; mode++) { + if (mode == 1) { + options.db_log_dir = dbname_ + "_logs"; + env_->CreateDirIfMissing(options.db_log_dir); + } else { + 
options.db_log_dir = ""; + } + for (int i = 0; i < 8; i++) { + Reopen(options); + } - const Snapshot* s2 = db_->GetSnapshot(); - ASSERT_EQ(2U, GetNumSnapshots()); - ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); - Put(0, "foo", "0v3"); - Put(1, "foo", "1v3"); + std::vector files; + env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir, + &files); + int info_log_count = 0; + for (std::string file : files) { + if (file.find("LOG") != std::string::npos) { + info_log_count++; + } + } + ASSERT_EQ(5, info_log_count); - { - ManagedSnapshot s3(db_); - ASSERT_EQ(3U, GetNumSnapshots()); - ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + Destroy(options); + // For mode (1), test DestroyDB() to delete all the logs under DB dir. + // For mode (2), no info log file should have been put under DB dir. + std::vector db_files; + env_->GetChildren(dbname_, &db_files); + for (std::string file : db_files) { + ASSERT_TRUE(file.find("LOG") == std::string::npos); + } - Put(0, "foo", "0v4"); - Put(1, "foo", "1v4"); - ASSERT_EQ("0v1", Get(0, "foo", s1)); - ASSERT_EQ("1v1", Get(1, "foo", s1)); - ASSERT_EQ("0v2", Get(0, "foo", s2)); - ASSERT_EQ("1v2", Get(1, "foo", s2)); - ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot())); - ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot())); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); + if (mode == 1) { + // Cleaning up + env_->GetChildren(options.db_log_dir, &files); + for (std::string file : files) { + env_->DeleteFile(options.db_log_dir + "/" + file); + } + env_->DeleteDir(options.db_log_dir); } + } +} - ASSERT_EQ(2U, GetNumSnapshots()); - ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); - ASSERT_EQ("0v1", Get(0, "foo", s1)); - ASSERT_EQ("1v1", Get(1, "foo", s1)); - ASSERT_EQ("0v2", Get(0, "foo", s2)); - ASSERT_EQ("1v2", Get(1, "foo", s2)); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); +#ifndef ROCKSDB_LITE +// Multi-threaded test: +namespace { - db_->ReleaseSnapshot(s1); - 
ASSERT_EQ("0v2", Get(0, "foo", s2)); - ASSERT_EQ("1v2", Get(1, "foo", s2)); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); - ASSERT_EQ(1U, GetNumSnapshots()); - ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); +static const int kColumnFamilies = 10; +static const int kNumThreads = 10; +static const int kTestSeconds = 10; +static const int kNumKeys = 1000; - db_->ReleaseSnapshot(s2); - ASSERT_EQ(0U, GetNumSnapshots()); - ASSERT_EQ("0v4", Get(0, "foo")); - ASSERT_EQ("1v4", Get(1, "foo")); - } while (ChangeOptions(kSkipHashCuckoo)); -} +struct MTState { + DBTest* test; + std::atomic stop; + std::atomic counter[kNumThreads]; + std::atomic thread_done[kNumThreads]; +}; -TEST_F(DBTest, HiddenValuesAreRemoved) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - CreateAndReopenWithCF({"pikachu"}, options); - Random rnd(301); - FillLevels("a", "z", 1); +struct MTThread { + MTState* state; + int id; +}; - std::string big = RandomString(&rnd, 50000); - Put(1, "foo", big); - Put(1, "pastfoo", "v"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "tiny"); - Put(1, "pastfoo2", "v2"); // Advance sequence number one more +static void MTThreadBody(void* arg) { + MTThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->state->test->db_; + int counter = 0; + fprintf(stderr, "... 
starting thread %d\n", id); + Random rnd(1000 + id); + char valbuf[1500]; + while (t->state->stop.load(std::memory_order_acquire) == false) { + t->state->counter[id].store(counter, std::memory_order_release); - ASSERT_OK(Flush(1)); - ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); + int key = rnd.Uniform(kNumKeys); + char keybuf[20]; + snprintf(keybuf, sizeof(keybuf), "%016d", key); - ASSERT_EQ(big, Get(1, "foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); - Slice x("x"); - dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); - dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + if (rnd.OneIn(2)) { + // Write values of the form . + // into each of the CFs + // We add some padding for force compactions. + int unique_id = rnd.Uniform(1000000); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); - // ApproximateOffsetOf() is not yet implemented in plain table format, - // which is used by Size(). - // skip HashCuckooRep as it does not support snapshot - } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | - kSkipPlainTable | kSkipHashCuckoo)); + // Half of the time directly use WriteBatch. Half of the time use + // WriteBatchWithIndex. 
+ if (rnd.OneIn(2)) { + WriteBatch batch; + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); + } else { + WriteBatchWithIndex batch(db->GetOptions().comparator); + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); + } + } else { + // Read a value and verify that it matches the pattern written above + // and that writes to all column families were atomic (unique_id is the + // same) + std::vector keys(kColumnFamilies, Slice(keybuf)); + std::vector values; + std::vector statuses = + db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values); + Status s = statuses[0]; + // all statuses have to be the same + for (size_t i = 1; i < statuses.size(); ++i) { + // they are either both ok or both not-found + ASSERT_TRUE((s.ok() && statuses[i].ok()) || + (s.IsNotFound() && statuses[i].IsNotFound())); + } + if (s.IsNotFound()) { + // Key has not yet been written + } else { + // Check that the writer thread counter is >= the counter in the value + ASSERT_OK(s); + int unique_id = -1; + for (int i = 0; i < kColumnFamilies; ++i) { + int k, w, c, cf, u; + ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c, + &cf, &u)) + << values[i]; + ASSERT_EQ(k, key); + ASSERT_GE(w, 0); + ASSERT_LT(w, kNumThreads); + ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire)); + ASSERT_EQ(cf, i); + if (i == 0) { + unique_id = u; + } else { + // this checks that updates across column families happened + // atomically -- all unique ids are the same + ASSERT_EQ(u, unique_id); + } + } + } + } + 
counter++; + } + t->state->thread_done[id].store(true, std::memory_order_release); + fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter)); } -TEST_F(DBTest, CompactBetweenSnapshots) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, options); - Random rnd(301); - FillLevels("a", "z", 1); +} // namespace - Put(1, "foo", "first"); - const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "second"); - Put(1, "foo", "third"); - Put(1, "foo", "fourth"); - const Snapshot* snapshot2 = db_->GetSnapshot(); - Put(1, "foo", "fifth"); - Put(1, "foo", "sixth"); +class MultiThreadedDBTest : public DBTest, + public ::testing::WithParamInterface { + public: + virtual void SetUp() override { option_config_ = GetParam(); } - // All entries (including duplicates) exist - // before any compaction or flush is triggered. 
- ASSERT_EQ(AllEntriesFor("foo", 1), - "[ sixth, fifth, fourth, third, second, first ]"); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ("first", Get(1, "foo", snapshot1)); + static std::vector GenerateOptionConfigs() { + std::vector optionConfigs; + for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) { + // skip as HashCuckooRep does not support snapshot + if (optionConfig != kHashCuckoo) { + optionConfigs.push_back(optionConfig); + } + } + return optionConfigs; + } +}; - // After a flush, "second", "third" and "fifth" should - // be removed - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); +TEST_P(MultiThreadedDBTest, MultiThreaded) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + std::vector cfs; + for (int i = 1; i < kColumnFamilies; ++i) { + cfs.push_back(ToString(i)); + } + CreateAndReopenWithCF(cfs, CurrentOptions(options_override)); + // Initialize state + MTState mt; + mt.test = this; + mt.stop.store(false, std::memory_order_release); + for (int id = 0; id < kNumThreads; id++) { + mt.counter[id].store(0, std::memory_order_release); + mt.thread_done[id].store(false, std::memory_order_release); + } - // after we release the snapshot1, only two values left - db_->ReleaseSnapshot(snapshot1); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + // Start threads + MTThread thread[kNumThreads]; + for (int id = 0; id < kNumThreads; id++) { + thread[id].state = &mt; + thread[id].id = id; + env_->StartThread(MTThreadBody, &thread[id]); + } - // We have only one valid snapshot snapshot2. Since snapshot1 is - // not valid anymore, "first" should be removed by a compaction. 
- ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); + // Let them run for a while + env_->SleepForMicroseconds(kTestSeconds * 1000000); - // after we release the snapshot2, only one value should be left - db_->ReleaseSnapshot(snapshot2); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); - // skip HashCuckooRep as it does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction)); + // Stop the threads and wait for them to finish + mt.stop.store(true, std::memory_order_release); + for (int id = 0; id < kNumThreads; id++) { + while (mt.thread_done[id].load(std::memory_order_acquire) == false) { + env_->SleepForMicroseconds(100000); + } + } } -TEST_F(DBTest, UnremovableSingleDelete) { - // If we compact: - // - // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2) - // - // We do not want to end up with: - // - // Put(A, v1) Snapshot Put(A, v2) - // - // Because a subsequent SingleDelete(A) would delete the Put(A, v2) - // but not Put(A, v1), so Get(A) would return v1. 
- anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, options); - - Put(1, "foo", "first"); - const Snapshot* snapshot = db_->GetSnapshot(); - SingleDelete(1, "foo"); - Put(1, "foo", "second"); - ASSERT_OK(Flush(1)); - - ASSERT_EQ("first", Get(1, "foo", snapshot)); - ASSERT_EQ("second", Get(1, "foo")); +INSTANTIATE_TEST_CASE_P( + MultiThreaded, MultiThreadedDBTest, + ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs())); +#endif // ROCKSDB_LITE - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); +// Group commit test: +namespace { - SingleDelete(1, "foo"); +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; - ASSERT_EQ("first", Get(1, "foo", snapshot)); - ASSERT_EQ("NOT_FOUND", Get(1, "foo")); +struct GCThread { + DB* db; + int id; + std::atomic done; +}; - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; - ASSERT_EQ("first", Get(1, "foo", snapshot)); - ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - db_->ReleaseSnapshot(snapshot); - // Skip HashCuckooRep as it does not support single delete. FIFO and - // universal compaction do not apply to the test case. Skip MergePut - // because single delete does not get removed when it encounters a merge. 
- } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | - kSkipUniversalCompaction | kSkipMergePut)); + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(ToString(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; } -TEST_F(DBTest, DeletionMarkers1) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); - ASSERT_OK(Flush(1)); - const int last = 2; - MoveFilesToLevel(last, 1); - // foo => v1 is now in last level - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); +} // namespace - // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); - MoveFilesToLevel(last - 1, 1); - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - - Delete(1, "foo"); - Put(1, "foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); - ASSERT_OK(Flush(1)); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - Slice z("z"); - dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); - // DEL eliminated, but v1 remains because we aren't compacting that level - // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); - // Merging last-1 w/ last, so we are the base level for "foo", so - // DEL is removed. (as is v1). 
- ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); -} - -TEST_F(DBTest, DeletionMarkers2) { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); - ASSERT_OK(Flush(1)); - const int last = 2; - MoveFilesToLevel(last, 1); - // foo => v1 is now in last level - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - - // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); - MoveFilesToLevel(last - 1, 1); - ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); - ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - - Delete(1, "foo"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - ASSERT_OK(Flush(1)); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); - // DEL kept: "last" file overlaps - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); - // Merging last-1 w/ last, so we are the base level for "foo", so - // DEL is removed. (as is v1). - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); -} - -TEST_F(DBTest, OverlapInLevel0) { - do { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - - //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. - ASSERT_OK(Put(1, "100", "v100")); - ASSERT_OK(Put(1, "999", "v999")); - Flush(1); - MoveFilesToLevel(2, 1); - ASSERT_OK(Delete(1, "100")); - ASSERT_OK(Delete(1, "999")); - Flush(1); - MoveFilesToLevel(1, 1); - ASSERT_EQ("0,1,1", FilesPerLevel(1)); - - // Make files spanning the following ranges in level-0: - // files[0] 200 .. 900 - // files[1] 300 .. 500 - // Note that files are sorted by smallest key. 
- ASSERT_OK(Put(1, "300", "v300")); - ASSERT_OK(Put(1, "500", "v500")); - Flush(1); - ASSERT_OK(Put(1, "200", "v200")); - ASSERT_OK(Put(1, "600", "v600")); - ASSERT_OK(Put(1, "900", "v900")); - Flush(1); - ASSERT_EQ("2,1,1", FilesPerLevel(1)); - - // Compact away the placeholder files we created initially - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); - ASSERT_EQ("2", FilesPerLevel(1)); - - // Do a memtable compaction. Before bug-fix, the compaction would - // not detect the overlap with level-0 files and would incorrectly place - // the deletion in a deeper level. - ASSERT_OK(Delete(1, "600")); - Flush(1); - ASSERT_EQ("3", FilesPerLevel(1)); - ASSERT_EQ("NOT_FOUND", Get(1, "600")); - } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); -} - -TEST_F(DBTest, ComparatorCheck) { - class NewComparator : public Comparator { - public: - virtual const char* Name() const override { - return "rocksdb.NewComparator"; - } - virtual int Compare(const Slice& a, const Slice& b) const override { - return BytewiseComparator()->Compare(a, b); - } - virtual void FindShortestSeparator(std::string* s, - const Slice& l) const override { - BytewiseComparator()->FindShortestSeparator(s, l); - } - virtual void FindShortSuccessor(std::string* key) const override { - BytewiseComparator()->FindShortSuccessor(key); - } - }; - Options new_options, options; - NewComparator cmp; - do { - options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, options); - new_options = CurrentOptions(); - new_options.comparator = &cmp; - // only the non-default column family has non-matching comparator - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, - std::vector({options, new_options})); - ASSERT_TRUE(!s.ok()); - ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) - << s.ToString(); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, CustomComparator) { - class 
NumberComparator : public Comparator { - public: - virtual const char* Name() const override { - return "test.NumberComparator"; - } - virtual int Compare(const Slice& a, const Slice& b) const override { - return ToNumber(a) - ToNumber(b); - } - virtual void FindShortestSeparator(std::string* s, - const Slice& l) const override { - ToNumber(*s); // Check format - ToNumber(l); // Check format - } - virtual void FindShortSuccessor(std::string* key) const override { - ToNumber(*key); // Check format - } - private: - static int ToNumber(const Slice& x) { - // Check that there are no extra characters. - EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']') - << EscapeString(x); - int val; - char ignored; - EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) - << EscapeString(x); - return val; - } - }; - Options new_options; - NumberComparator cmp; - do { - new_options = CurrentOptions(); - new_options.create_if_missing = true; - new_options.comparator = &cmp; - new_options.write_buffer_size = 4096; // Compact more often - new_options.arena_block_size = 4096; - new_options = CurrentOptions(new_options); - DestroyAndReopen(new_options); - CreateAndReopenWithCF({"pikachu"}, new_options); - ASSERT_OK(Put(1, "[10]", "ten")); - ASSERT_OK(Put(1, "[0x14]", "twenty")); - for (int i = 0; i < 2; i++) { - ASSERT_EQ("ten", Get(1, "[10]")); - ASSERT_EQ("ten", Get(1, "[0xa]")); - ASSERT_EQ("twenty", Get(1, "[20]")); - ASSERT_EQ("twenty", Get(1, "[0x14]")); - ASSERT_EQ("NOT_FOUND", Get(1, "[15]")); - ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]")); - Compact(1, "[0]", "[9999]"); - } - - for (int run = 0; run < 2; run++) { - for (int i = 0; i < 1000; i++) { - char buf[100]; - snprintf(buf, sizeof(buf), "[%d]", i*10); - ASSERT_OK(Put(1, buf, buf)); - } - Compact(1, "[0]", "[1000000]"); - } - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, DBOpen_Options) { - Options options = CurrentOptions(); - std::string dbname = test::TmpDir(env_) + 
"/db_options_test"; - ASSERT_OK(DestroyDB(dbname, options)); - - // Does not exist, and create_if_missing == false: error - DB* db = nullptr; - options.create_if_missing = false; - Status s = DB::Open(options, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); - ASSERT_TRUE(db == nullptr); - - // Does not exist, and create_if_missing == true: OK - options.create_if_missing = true; - s = DB::Open(options, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != nullptr); - - delete db; - db = nullptr; - - // Does exist, and error_if_exists == true: error - options.create_if_missing = false; - options.error_if_exists = true; - s = DB::Open(options, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); - ASSERT_TRUE(db == nullptr); - - // Does exist, and error_if_exists == false: OK - options.create_if_missing = true; - options.error_if_exists = false; - s = DB::Open(options, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != nullptr); - - delete db; - db = nullptr; -} - -TEST_F(DBTest, DBOpen_Change_NumLevels) { - Options options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); - ASSERT_TRUE(db_ != nullptr); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "a", "123")); - ASSERT_OK(Put(1, "b", "234")); - Flush(1); - MoveFilesToLevel(3, 1); - Close(); - - options.create_if_missing = false; - options.num_levels = 2; - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); - ASSERT_TRUE(db_ == nullptr); -} - -TEST_F(DBTest, DestroyDBMetaDatabase) { - std::string dbname = test::TmpDir(env_) + "/db_meta"; - ASSERT_OK(env_->CreateDirIfMissing(dbname)); - std::string metadbname = MetaDatabaseName(dbname, 0); - ASSERT_OK(env_->CreateDirIfMissing(metadbname)); - std::string metametadbname = MetaDatabaseName(metadbname, 0); - 
ASSERT_OK(env_->CreateDirIfMissing(metametadbname)); - - // Destroy previous versions if they exist. Using the long way. - Options options = CurrentOptions(); - ASSERT_OK(DestroyDB(metametadbname, options)); - ASSERT_OK(DestroyDB(metadbname, options)); - ASSERT_OK(DestroyDB(dbname, options)); - - // Setup databases - DB* db = nullptr; - ASSERT_OK(DB::Open(options, dbname, &db)); - delete db; - db = nullptr; - ASSERT_OK(DB::Open(options, metadbname, &db)); - delete db; - db = nullptr; - ASSERT_OK(DB::Open(options, metametadbname, &db)); - delete db; - db = nullptr; - - // Delete databases - ASSERT_OK(DestroyDB(dbname, options)); - - // Check if deletion worked. - options.create_if_missing = false; - ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); -} - -// Check that number of files does not grow when writes are dropped -TEST_F(DBTest, DropWrites) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.paranoid_checks = false; - Reopen(options); - - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - Compact("a", "z"); - const size_t num_files = CountFiles(); - // Force out-of-space errors - env_->drop_writes_.store(true, std::memory_order_release); - env_->sleep_counter_.Reset(); - env_->no_sleep_ = true; - for (int i = 0; i < 5; i++) { - if (option_config_ != kUniversalCompactionMultiLevel && - option_config_ != kUniversalSubcompactions) { - for (int level = 0; level < dbfull()->NumberLevels(); level++) { - if (level > 0 && level == dbfull()->NumberLevels() - 1) { - break; - } - dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, - true /* disallow trivial move */); - } - } else { - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - } - } - - std::string property_value; - ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); - ASSERT_EQ("5", property_value); - 
- env_->drop_writes_.store(false, std::memory_order_release); - ASSERT_LT(CountFiles(), num_files + 3); - - // Check that compaction attempts slept after errors - // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler - // versions - ASSERT_GE(env_->sleep_counter_.Read(), 4); - } while (ChangeCompactOptions()); -} - -// Check background error counter bumped on flush failures. -TEST_F(DBTest, DropWritesFlush) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.max_background_flushes = 1; - Reopen(options); - - ASSERT_OK(Put("foo", "v1")); - // Force out-of-space errors - env_->drop_writes_.store(true, std::memory_order_release); - - std::string property_value; - // Background error count is 0 now. - ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); - ASSERT_EQ("0", property_value); - - dbfull()->TEST_FlushMemTable(true); - - ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); - ASSERT_EQ("1", property_value); - - env_->drop_writes_.store(false, std::memory_order_release); - } while (ChangeCompactOptions()); -} - -// Check that CompactRange() returns failure if there is not enough space left -// on device -TEST_F(DBTest, NoSpaceCompactRange) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.disable_auto_compactions = true; - Reopen(options); - - // generate 5 tables - for (int i = 0; i < 5; ++i) { - ASSERT_OK(Put(Key(i), Key(i) + "v")); - ASSERT_OK(Flush()); - } - - // Force out-of-space errors - env_->no_space_.store(true, std::memory_order_release); - - Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow trivial move */); - ASSERT_TRUE(s.IsIOError()); - - env_->no_space_.store(false, std::memory_order_release); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, NonWritableFileSystem) { - do { - Options options = CurrentOptions(); - options.write_buffer_size = 4096; - options.arena_block_size = 
4096; - options.env = env_; - Reopen(options); - ASSERT_OK(Put("foo", "v1")); - env_->non_writeable_rate_.store(100); - std::string big(100000, 'x'); - int errors = 0; - for (int i = 0; i < 20; i++) { - if (!Put("foo", big).ok()) { - errors++; - env_->SleepForMicroseconds(100000); - } - } - ASSERT_GT(errors, 0); - env_->non_writeable_rate_.store(0); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, ManifestWriteError) { - // Test for the following problem: - // (a) Compaction produces file F - // (b) Log record containing F is written to MANIFEST file, but Sync() fails - // (c) GC deletes F - // (d) After reopening DB, reads fail since deleted F is named in log record - - // We iterate twice. In the second iteration, everything is the - // same except the log record never makes it to the MANIFEST file. - for (int iter = 0; iter < 2; iter++) { - std::atomic* error_type = (iter == 0) - ? &env_->manifest_sync_error_ - : &env_->manifest_write_error_; - - // Insert foo=>bar mapping - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - options.error_if_exists = false; - DestroyAndReopen(options); - ASSERT_OK(Put("foo", "bar")); - ASSERT_EQ("bar", Get("foo")); - - // Memtable compaction (will succeed) - Flush(); - ASSERT_EQ("bar", Get("foo")); - const int last = 2; - MoveFilesToLevel(2); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level - - // Merging compaction (will fail) - error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail - ASSERT_EQ("bar", Get("foo")); - - // Recovery: should not lose data - error_type->store(false, std::memory_order_release); - Reopen(options); - ASSERT_EQ("bar", Get("foo")); - } -} - -TEST_F(DBTest, PutFailsParanoid) { - // Test the following: - // (a) A random put fails in paranoid mode (simulate by sync fail) - // (b) All other puts have to fail, even if writes would succeed - // (c) All of that 
should happen ONLY if paranoid_checks = true - - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - options.error_if_exists = false; - options.paranoid_checks = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - Status s; - - ASSERT_OK(Put(1, "foo", "bar")); - ASSERT_OK(Put(1, "foo1", "bar1")); - // simulate error - env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); - env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); - // the next put should fail, too - ASSERT_TRUE(!s.ok()); - // but we're still able to read - ASSERT_EQ("bar", Get(1, "foo")); - - // do the same thing with paranoid checks off - options.paranoid_checks = false; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - - ASSERT_OK(Put(1, "foo", "bar")); - ASSERT_OK(Put(1, "foo1", "bar1")); - // simulate error - env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); - env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); - // the next put should NOT fail - ASSERT_TRUE(s.ok()); -} - -TEST_F(DBTest, BloomFilter) { - do { - Options options = CurrentOptions(); - env_->count_random_reads_ = true; - options.env = env_; - // ChangeCompactOptions() only changes compaction style, which does not - // trigger reset of table_factory - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - CreateAndReopenWithCF({"pikachu"}, options); - - // Populate multiple layers - const int N = 10000; - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - Compact(1, "a", "z"); - for (int i = 0; i < N; i += 100) { - ASSERT_OK(Put(1, Key(i), 
Key(i))); - } - Flush(1); - - // Prevent auto compactions triggered by seeks - env_->delay_sstable_sync_.store(true, std::memory_order_release); - - // Lookup present keys. Should rarely read from small sstable. - env_->random_read_counter_.Reset(); - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - int reads = env_->random_read_counter_.Read(); - fprintf(stderr, "%d present => %d reads\n", N, reads); - ASSERT_GE(reads, N); - ASSERT_LE(reads, N + 2*N/100); - - // Lookup present keys. Should rarely read from either sstable. - env_->random_read_counter_.Reset(); - for (int i = 0; i < N; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); - } - reads = env_->random_read_counter_.Read(); - fprintf(stderr, "%d missing => %d reads\n", N, reads); - ASSERT_LE(reads, 3*N/100); - - env_->delay_sstable_sync_.store(false, std::memory_order_release); - Close(); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, BloomFilterRate) { - while (ChangeFilterOptions()) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - // Add a large key to make the file contain wide range - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - - // Check if filter is useful - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); - } - ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); - } -} - -TEST_F(DBTest, BloomFilterCompatibility) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - 
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // Create with block based filter - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check db with full filter - table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); -} - -TEST_F(DBTest, BloomFilterReverseCompatibility) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // Create with full filter - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check db with block_based filter - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); -} - -namespace { -// A wrapped bloom over default FilterPolicy -class WrappedBloom : public FilterPolicy { - public: - explicit WrappedBloom(int bits_per_key) : - 
filter_(NewBloomFilterPolicy(bits_per_key)), - counter_(0) {} - - ~WrappedBloom() { delete filter_; } - - const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } - - void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst) - const override { - std::unique_ptr user_keys(new rocksdb::Slice[n]); - for (int i = 0; i < n; ++i) { - user_keys[i] = convertKey(keys[i]); - } - return filter_->CreateFilter(user_keys.get(), n, dst); - } - - bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter) - const override { - counter_++; - return filter_->KeyMayMatch(convertKey(key), filter); - } - - uint32_t GetCounter() { return counter_; } - - private: - const FilterPolicy* filter_; - mutable uint32_t counter_; - - rocksdb::Slice convertKey(const rocksdb::Slice& key) const { - return key; - } -}; -} // namespace - -TEST_F(DBTest, BloomFilterWrapper) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - - BlockBasedTableOptions table_options; - WrappedBloom* policy = new WrappedBloom(10); - table_options.filter_policy.reset(policy); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - CreateAndReopenWithCF({"pikachu"}, options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - // Add a large key to make the file contain wide range - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - ASSERT_EQ(0U, policy->GetCounter()); - Flush(1); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ(1U * maxKey, policy->GetCounter()); - - // Check if filter is useful - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); - } - ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); - ASSERT_EQ(2U * maxKey, policy->GetCounter()); -} - 
-TEST_F(DBTest, SnapshotFiles) { - do { - Options options = CurrentOptions(); - options.write_buffer_size = 100000000; // Large write buffer - CreateAndReopenWithCF({"pikachu"}, options); - - Random rnd(301); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - std::vector values; - for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put((i < 40), Key(i), values[i])); - } - - // assert that nothing makes it to disk yet. - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - - // get a file snapshot - uint64_t manifest_number = 0; - uint64_t manifest_size = 0; - std::vector files; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(files, &manifest_size); - - // CURRENT, MANIFEST, *.sst files (one for each CF) - ASSERT_EQ(files.size(), 4U); - - uint64_t number = 0; - FileType type; - - // copy these files to a new snapshot directory - std::string snapdir = dbname_ + ".snapdir/"; - ASSERT_OK(env_->CreateDirIfMissing(snapdir)); - - for (unsigned int i = 0; i < files.size(); i++) { - // our clients require that GetLiveFiles returns - // files with "/" as first character! 
- ASSERT_EQ(files[i][0], '/'); - std::string src = dbname_ + files[i]; - std::string dest = snapdir + files[i]; - - uint64_t size; - ASSERT_OK(env_->GetFileSize(src, &size)); - - // record the number and the size of the - // latest manifest file - if (ParseFileName(files[i].substr(1), &number, &type)) { - if (type == kDescriptorFile) { - if (number > manifest_number) { - manifest_number = number; - ASSERT_GE(size, manifest_size); - size = manifest_size; // copy only valid MANIFEST data - } - } - } - CopyFile(src, dest, size); - } - - // release file snapshot - dbfull()->DisableFileDeletions(); - // overwrite one key, this key should not appear in the snapshot - std::vector extras; - for (unsigned int i = 0; i < 1; i++) { - extras.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(0, Key(i), extras[i])); - } - - // verify that data in the snapshot are correct - std::vector column_families; - column_families.emplace_back("default", ColumnFamilyOptions()); - column_families.emplace_back("pikachu", ColumnFamilyOptions()); - std::vector cf_handles; - DB* snapdb; - DBOptions opts; - opts.env = env_; - opts.create_if_missing = false; - Status stat = - DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); - ASSERT_OK(stat); - - ReadOptions roptions; - std::string val; - for (unsigned int i = 0; i < 80; i++) { - stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); - ASSERT_EQ(values[i].compare(val), 0); - } - for (auto cfh : cf_handles) { - delete cfh; - } - delete snapdb; - - // look at the new live files after we added an 'extra' key - // and after we took the first snapshot. - uint64_t new_manifest_number = 0; - uint64_t new_manifest_size = 0; - std::vector newfiles; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(newfiles, &new_manifest_size); - - // find the new manifest file. assert that this manifest file is - // the same one as in the previous snapshot. 
But its size should be - // larger because we added an extra key after taking the - // previous shapshot. - for (unsigned int i = 0; i < newfiles.size(); i++) { - std::string src = dbname_ + "/" + newfiles[i]; - // record the lognumber and the size of the - // latest manifest file - if (ParseFileName(newfiles[i].substr(1), &number, &type)) { - if (type == kDescriptorFile) { - if (number > new_manifest_number) { - uint64_t size; - new_manifest_number = number; - ASSERT_OK(env_->GetFileSize(src, &size)); - ASSERT_GE(size, new_manifest_size); - } - } - } - } - ASSERT_EQ(manifest_number, new_manifest_number); - ASSERT_GT(new_manifest_size, manifest_size); - - // release file snapshot - dbfull()->DisableFileDeletions(); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, CompactOnFlush) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, options); - - Put(1, "foo", "v1"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); - - // Write two new keys - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); - - // Case1: Delete followed by a put - Delete(1, "foo"); - Put(1, "foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); - - // After the current memtable is flushed, the DEL should - // have been removed - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); - - // Case 2: Delete followed by another delete - Delete(1, "foo"); - Delete(1, "foo"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ 
]"); - - // Case 3: Put followed by a delete - Put(1, "foo", "v3"); - Delete(1, "foo"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - - // Case 4: Put followed by another Put - Put(1, "foo", "v4"); - Put(1, "foo", "v5"); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - - // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - - // Case 5: Put followed by snapshot followed by another Put - // Both puts should remain. - Put(1, "foo", "v6"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "v7"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); - db_->ReleaseSnapshot(snapshot); - - // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); - - // Case 5: snapshot followed by a put followed by another Put - // Only the last put should remain. 
- const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "v8"); - Put(1, "foo", "v9"); - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); - db_->ReleaseSnapshot(snapshot1); - } while (ChangeCompactOptions()); -} - -namespace { -std::vector ListSpecificFiles( - Env* env, const std::string& path, const FileType expected_file_type) { - std::vector files; - std::vector file_numbers; - env->GetChildren(path, &files); - uint64_t number; - FileType type; - for (size_t i = 0; i < files.size(); ++i) { - if (ParseFileName(files[i], &number, &type)) { - if (type == expected_file_type) { - file_numbers.push_back(number); - } - } - } - return std::move(file_numbers); -} - -std::vector ListTableFiles(Env* env, const std::string& path) { - return ListSpecificFiles(env, path, kTableFile); -} -} // namespace - -TEST_F(DBTest, FlushOneColumnFamily) { - Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", - "alyosha", "popovich"}, - options); - - ASSERT_OK(Put(0, "Default", "Default")); - ASSERT_OK(Put(1, "pikachu", "pikachu")); - ASSERT_OK(Put(2, "ilya", "ilya")); - ASSERT_OK(Put(3, "muromec", "muromec")); - ASSERT_OK(Put(4, "dobrynia", "dobrynia")); - ASSERT_OK(Put(5, "nikitich", "nikitich")); - ASSERT_OK(Put(6, "alyosha", "alyosha")); - ASSERT_OK(Put(7, "popovich", "popovich")); - - for (int i = 0; i < 8; ++i) { - Flush(i); - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), i + 1U); - } -} - -// In https://reviews.facebook.net/D20661 we change -// recovery behavior: previously for each log file each column family -// memtable was flushed, even it was empty. 
Now it's changed: -// we try to create the smallest number of table files by merging -// updates from multiple logs -TEST_F(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { - Options options = CurrentOptions(); - options.write_buffer_size = 5000000; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - - // Since we will reopen DB with smaller write_buffer_size, - // each key will go to new SST file - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - - ASSERT_OK(Put(3, Key(10), DummyString(1))); - // Make 'dobrynia' to be flushed and new WAL file to be created - ASSERT_OK(Put(2, Key(10), DummyString(7500000))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(1)); - // Make sure 'dobrynia' was flushed: check sst files amount - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - } - // New WAL file - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - - options.write_buffer_size = 4096; - options.arena_block_size = 4096; - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - options); - { - // No inserts => default is empty - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(5)); - // 1 SST for big key + 1 SST for small one - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(2)); - // 1 SST for all keys - 
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(1)); - } -} - -// In https://reviews.facebook.net/D20661 we change -// recovery behavior: previously for each log file each column family -// memtable was flushed, even it wasn't empty. Now it's changed: -// we try to create the smallest number of table files by merging -// updates from multiple logs -TEST_F(DBTest, RecoverCheckFileAmount) { - Options options = CurrentOptions(); - options.write_buffer_size = 100000; - options.arena_block_size = 4 * 1024; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - // Make 'nikitich' memtable to be flushed - ASSERT_OK(Put(3, Key(10), DummyString(1002400))); - ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // 4 memtable are not flushed, 1 sst file - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(1)); - } - // Memtable for 'nikitich' has flushed, new WAL file has opened - // 4 memtable still not flushed - - // Write to new WAL file - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - // Fill up 'nikitich' one more time - ASSERT_OK(Put(3, Key(10), DummyString(1002400))); - // make it flush - ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // There are still 4 memtable not flushed, and 2 sst tables - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - 
static_cast(2)); - } - - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - options); - { - std::vector table_files = ListTableFiles(env_, dbname_); - // Check, that records for 'default', 'dobrynia' and 'pikachu' from - // first, second and third WALs went to the same SST. - // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for - // 'dobrynia', one for 'pikachu' - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(3)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); - } -} - -TEST_F(DBTest, SharedWriteBuffer) { - Options options = CurrentOptions(); - options.db_write_buffer_size = 100000; // this is the real limit - options.write_buffer_size = 500000; // this is never hit - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - - // Trigger a flush on every CF - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(3, Key(1), DummyString(90000))); - ASSERT_OK(Put(2, Key(2), DummyString(20000))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(1)); - } - - // Flush 'dobrynia' and 'nikitich' - ASSERT_OK(Put(2, Key(2), DummyString(50000))); - ASSERT_OK(Put(3, Key(2), DummyString(40000))); - ASSERT_OK(Put(2, 
Key(3), DummyString(20000))); - ASSERT_OK(Put(3, Key(2), DummyString(40000))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(2)); - } - - // Make 'dobrynia' and 'nikitich' both take up 40% of space - // When 'pikachu' puts us over 100%, all 3 flush. - ASSERT_OK(Put(2, Key(2), DummyString(40000))); - ASSERT_OK(Put(1, Key(2), DummyString(20000))); - ASSERT_OK(Put(0, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(3)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(3)); - } - - // Some remaining writes so 'default' and 'nikitich' flush on closure. 
- ASSERT_OK(Put(3, Key(1), DummyString(1))); - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - options); - { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(3)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(4)); - } -} - -TEST_F(DBTest, PurgeInfoLogs) { - Options options = CurrentOptions(); - options.keep_log_file_num = 5; - options.create_if_missing = true; - for (int mode = 0; mode <= 1; mode++) { - if (mode == 1) { - options.db_log_dir = dbname_ + "_logs"; - env_->CreateDirIfMissing(options.db_log_dir); - } else { - options.db_log_dir = ""; - } - for (int i = 0; i < 8; i++) { - Reopen(options); - } - - std::vector files; - env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir, - &files); - int info_log_count = 0; - for (std::string file : files) { - if (file.find("LOG") != std::string::npos) { - info_log_count++; - } - } - ASSERT_EQ(5, info_log_count); - - Destroy(options); - // For mode (1), test DestroyDB() to delete all the logs under DB dir. - // For mode (2), no info log file should have been put under DB dir. 
- std::vector db_files; - env_->GetChildren(dbname_, &db_files); - for (std::string file : db_files) { - ASSERT_TRUE(file.find("LOG") == std::string::npos); - } - - if (mode == 1) { - // Cleaning up - env_->GetChildren(options.db_log_dir, &files); - for (std::string file : files) { - env_->DeleteFile(options.db_log_dir + "/" + file); - } - env_->DeleteDir(options.db_log_dir); - } - } -} - -TEST_F(DBTest, SyncMultipleLogs) { - const uint64_t kNumBatches = 2; - const int kBatchSize = 1000; - - Options options = CurrentOptions(); - options.create_if_missing = true; - options.write_buffer_size = 4096; - Reopen(options); - - WriteBatch batch; - WriteOptions wo; - wo.sync = true; - - for (uint64_t b = 0; b < kNumBatches; b++) { - batch.Clear(); - for (int i = 0; i < kBatchSize; i++) { - batch.Put(Key(i), DummyString(128)); - } - - dbfull()->Write(wo, &batch); - } - - ASSERT_OK(dbfull()->SyncWAL()); -} - -// -// Test WAL recovery for the various modes available -// -class RecoveryTestHelper { - public: - // Number of WAL files to generate - static const int kWALFilesCount = 10; - // Starting number for the WAL file name like 00010.log - static const int kWALFileOffset = 10; - // Keys to be written per WAL file - static const int kKeysPerWALFile = 1024; - // Size of the value - static const int kValueSize = 10; - - // Create WAL files with values filled in - static void FillData(DBTest* test, Options& options, const size_t wal_count, - size_t& count) { - DBOptions& db_options = options; - - count = 0; - - shared_ptr table_cache = NewLRUCache(50000, 16); - EnvOptions env_options; - WriteBuffer write_buffer(db_options.db_write_buffer_size); - - unique_ptr versions; - unique_ptr wal_manager; - WriteController write_controller; - - versions.reset(new VersionSet(test->dbname_, &db_options, env_options, - table_cache.get(), &write_buffer, - &write_controller)); - - wal_manager.reset(new WalManager(db_options, env_options)); - - std::unique_ptr current_log_writer; - - for (size_t 
j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { - uint64_t current_log_number = j; - std::string fname = LogFileName(test->dbname_, current_log_number); - unique_ptr file; - ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); - current_log_writer.reset(new log::Writer(std::move(file_writer))); - - for (int i = 0; i < kKeysPerWALFile; i++) { - std::string key = "key" + ToString(count++); - std::string value = test->DummyString(kValueSize); - assert(current_log_writer.get() != nullptr); - uint64_t seq = versions->LastSequence() + 1; - WriteBatch batch; - batch.Put(key, value); - WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); - versions->SetLastSequence(seq); - } - } - } - - // Recreate and fill the store with some data - static size_t FillData(DBTest* test, Options& options) { - options.create_if_missing = true; - test->DestroyAndReopen(options); - test->Close(); - - size_t count = 0; - FillData(test, options, kWALFilesCount, count); - return count; - } - - // Read back all the keys we wrote and return the number of keys found - static size_t GetData(DBTest* test) { - size_t count = 0; - for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) { - if (test->Get("key" + ToString(i)) != "NOT_FOUND") { - ++count; - } - } - return count; - } - - // Manuall corrupt the specified WAL - static void CorruptWAL(DBTest* test, Options& options, const double off, - const double len, const int wal_file_id, - const bool trunc = false) { - Env* env = options.env; - std::string fname = LogFileName(test->dbname_, wal_file_id); - uint64_t size; - ASSERT_OK(env->GetFileSize(fname, &size)); - ASSERT_GT(size, 0); -#ifdef OS_WIN - // Windows disk cache behaves differently. When we truncate - // the original content is still in the cache due to the original - // handle is still open. 
Generally, in Windows, one prohibits - // shared access to files and it is not needed for WAL but we allow - // it to induce corruption at various tests. - test->Close(); -#endif - if (trunc) { - ASSERT_EQ(0, truncate(fname.c_str(), size * off)); - } else { - InduceCorruption(fname, size * off, size * len); - } - } - - // Overwrite data with 'a' from offset for length len - static void InduceCorruption(const std::string& filename, uint32_t offset, - uint32_t len) { - ASSERT_GT(len, 0); - - int fd = open(filename.c_str(), O_RDWR); - - ASSERT_GT(fd, 0); - ASSERT_EQ(offset, lseek(fd, offset, SEEK_SET)); - - void* buf = alloca(len); - memset(buf, 'a', len); - ASSERT_EQ(len, write(fd, buf, len)); - - close(fd); - } -}; - -// Test scope: -// - We expect to open the data store when there is incomplete trailing writes -// at the end of any of the logs -// - We do not expect to open the data store for corruption -TEST_F(DBTest, kTolerateCorruptedTailRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset position */ - for (int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, options); - // test checksum failure or parsing - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, /*wal=*/j, trunc); - - if (trunc) { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - const size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_TRUE(i == 0 || recovered_row_count > 0); - ASSERT_LT(recovered_row_count, row_count); - } else { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - 
ASSERT_NOK(TryReopen(options)); - } - } - } - } -} - -// Test scope: -// We don't expect the data store to be opened if there is any corruption -// (leading, middle or trailing -- incomplete writes or corruption) -TEST_F(DBTest, kAbsoluteConsistency) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - // Verify clean slate behavior - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, options); - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count); - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset position */ - if (trunc && i == 0) { - continue; - } - - for (int j = jstart; j < jend; j++) { /* wal files */ - // fill with new date - RecoveryTestHelper::FillData(this, options); - // corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - // verify - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; - options.create_if_missing = false; - ASSERT_NOK(TryReopen(options)); - } - } - } -} - -// Test scope: -// - We expect to open data store under all circumstances -// - We expect only data upto the point where the first error was encountered -TEST_F(DBTest, kPointInTimeRecovery) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - const int maxkeys = - RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Offset of corruption */ - for (int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = 
RecoveryTestHelper::FillData(this, options); - - // Corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify - options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); - - bool expect_data = true; - for (size_t k = 0; k < maxkeys; ++k) { - bool found = Get("key" + ToString(i)) != "NOT_FOUND"; - if (expect_data && !found) { - expect_data = false; - } - ASSERT_EQ(found, expect_data); - } - - const size_t min = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset); - ASSERT_GE(recovered_row_count, min); - if (!trunc && i != 0) { - const size_t max = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset + 1); - ASSERT_LE(recovered_row_count, max); - } - } - } - } -} - -// Test scope: -// - We expect to open the data store under all scenarios -// - We expect to have recovered records past the corruption zone -TEST_F(DBTest, kSkipAnyCorruptedRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset */ - for (int j = jstart; j < jend; j++) { /* wal files */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, options); - - // Corrupt the WAL - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify behavior - options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = 
RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); - - if (!trunc) { - ASSERT_TRUE(i != 0 || recovered_row_count > 0); - } - } - } - } -} - - -// Multi-threaded test: -namespace { - -static const int kColumnFamilies = 10; -static const int kNumThreads = 10; -static const int kTestSeconds = 10; -static const int kNumKeys = 1000; - -struct MTState { - DBTest* test; - std::atomic stop; - std::atomic counter[kNumThreads]; - std::atomic thread_done[kNumThreads]; -}; - -struct MTThread { - MTState* state; - int id; -}; - -static void MTThreadBody(void* arg) { - MTThread* t = reinterpret_cast(arg); - int id = t->id; - DB* db = t->state->test->db_; - int counter = 0; - fprintf(stderr, "... starting thread %d\n", id); - Random rnd(1000 + id); - char valbuf[1500]; - while (t->state->stop.load(std::memory_order_acquire) == false) { - t->state->counter[id].store(counter, std::memory_order_release); - - int key = rnd.Uniform(kNumKeys); - char keybuf[20]; - snprintf(keybuf, sizeof(keybuf), "%016d", key); - - if (rnd.OneIn(2)) { - // Write values of the form . - // into each of the CFs - // We add some padding for force compactions. - int unique_id = rnd.Uniform(1000000); - - // Half of the time directly use WriteBatch. Half of the time use - // WriteBatchWithIndex. 
- if (rnd.OneIn(2)) { - WriteBatch batch; - for (int cf = 0; cf < kColumnFamilies; ++cf) { - snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, - static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); - } - ASSERT_OK(db->Write(WriteOptions(), &batch)); - } else { - WriteBatchWithIndex batch(db->GetOptions().comparator); - for (int cf = 0; cf < kColumnFamilies; ++cf) { - snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, - static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); - } - ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); - } - } else { - // Read a value and verify that it matches the pattern written above - // and that writes to all column families were atomic (unique_id is the - // same) - std::vector keys(kColumnFamilies, Slice(keybuf)); - std::vector values; - std::vector statuses = - db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values); - Status s = statuses[0]; - // all statuses have to be the same - for (size_t i = 1; i < statuses.size(); ++i) { - // they are either both ok or both not-found - ASSERT_TRUE((s.ok() && statuses[i].ok()) || - (s.IsNotFound() && statuses[i].IsNotFound())); - } - if (s.IsNotFound()) { - // Key has not yet been written - } else { - // Check that the writer thread counter is >= the counter in the value - ASSERT_OK(s); - int unique_id = -1; - for (int i = 0; i < kColumnFamilies; ++i) { - int k, w, c, cf, u; - ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, - &c, &cf, &u)) - << values[i]; - ASSERT_EQ(k, key); - ASSERT_GE(w, 0); - ASSERT_LT(w, kNumThreads); - ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire)); - ASSERT_EQ(cf, i); - if (i == 0) { - unique_id = u; - } else { - // this checks that updates across column families happened - // atomically -- all unique ids are the same - ASSERT_EQ(u, unique_id); - } - } - } - } - 
counter++; - } - t->state->thread_done[id].store(true, std::memory_order_release); - fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter)); -} - -} // namespace - -class MultiThreadedDBTest : public DBTest, - public ::testing::WithParamInterface { - public: - virtual void SetUp() override { option_config_ = GetParam(); } - - static std::vector GenerateOptionConfigs() { - std::vector optionConfigs; - for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) { - // skip as HashCuckooRep does not support snapshot - if (optionConfig != kHashCuckoo) { - optionConfigs.push_back(optionConfig); - } - } - return optionConfigs; - } -}; - -TEST_P(MultiThreadedDBTest, MultiThreaded) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - std::vector cfs; - for (int i = 1; i < kColumnFamilies; ++i) { - cfs.push_back(ToString(i)); - } - CreateAndReopenWithCF(cfs, CurrentOptions(options_override)); - // Initialize state - MTState mt; - mt.test = this; - mt.stop.store(false, std::memory_order_release); - for (int id = 0; id < kNumThreads; id++) { - mt.counter[id].store(0, std::memory_order_release); - mt.thread_done[id].store(false, std::memory_order_release); - } - - // Start threads - MTThread thread[kNumThreads]; - for (int id = 0; id < kNumThreads; id++) { - thread[id].state = &mt; - thread[id].id = id; - env_->StartThread(MTThreadBody, &thread[id]); - } - - // Let them run for a while - env_->SleepForMicroseconds(kTestSeconds * 1000000); - - // Stop the threads and wait for them to finish - mt.stop.store(true, std::memory_order_release); - for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].load(std::memory_order_acquire) == false) { - env_->SleepForMicroseconds(100000); - } - } -} - -INSTANTIATE_TEST_CASE_P( - MultiThreaded, MultiThreadedDBTest, - ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs())); - -// Group commit test: -namespace { - -static const int 
kGCNumThreads = 4; -static const int kGCNumKeys = 1000; - -struct GCThread { - DB* db; - int id; - std::atomic done; -}; - -static void GCThreadBody(void* arg) { - GCThread* t = reinterpret_cast(arg); - int id = t->id; - DB* db = t->db; - WriteOptions wo; - - for (int i = 0; i < kGCNumKeys; ++i) { - std::string kv(ToString(i + id * kGCNumKeys)); - ASSERT_OK(db->Put(wo, kv, kv)); - } - t->done = true; -} - -} // namespace - -TEST_F(DBTest, GroupCommitTest) { - do { - Options options = CurrentOptions(); - options.env = env_; - env_->log_write_slowdown_.store(100); - options.statistics = rocksdb::CreateDBStatistics(); - Reopen(options); - - // Start threads - GCThread thread[kGCNumThreads]; - for (int id = 0; id < kGCNumThreads; id++) { - thread[id].id = id; - thread[id].db = db_; - thread[id].done = false; - env_->StartThread(GCThreadBody, &thread[id]); - } - - for (int id = 0; id < kGCNumThreads; id++) { - while (thread[id].done == false) { - env_->SleepForMicroseconds(100000); - } - } - env_->log_write_slowdown_.store(0); - - ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); - - std::vector expected_db; - for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { - expected_db.push_back(ToString(i)); - } - sort(expected_db.begin(), expected_db.end()); - - Iterator* itr = db_->NewIterator(ReadOptions()); - itr->SeekToFirst(); - for (auto x : expected_db) { - ASSERT_TRUE(itr->Valid()); - ASSERT_EQ(itr->key().ToString(), x); - ASSERT_EQ(itr->value().ToString(), x); - itr->Next(); - } - ASSERT_TRUE(!itr->Valid()); - delete itr; - - HistogramData hist_data = {0}; - options.statistics->histogramData(DB_WRITE, &hist_data); - ASSERT_GT(hist_data.average, 0.0); - } while (ChangeOptions(kSkipNoSeekToLast)); -} - -namespace { -typedef std::map KVMap; -} - -class ModelDB: public DB { - public: - class ModelSnapshot : public Snapshot { - public: - KVMap map_; - - virtual SequenceNumber GetSequenceNumber() const override { - // no need to call this - assert(false); - 
return 0; - } - }; - - explicit ModelDB(const Options& options) : options_(options) {} - using DB::Put; - virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& k, const Slice& v) override { - WriteBatch batch; - batch.Put(cf, k, v); - return Write(o, &batch); - } - using DB::Delete; - virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& key) override { - WriteBatch batch; - batch.Delete(cf, key); - return Write(o, &batch); - } - using DB::SingleDelete; - virtual Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& key) override { - WriteBatch batch; - batch.SingleDelete(cf, key); - return Write(o, &batch); - } - using DB::Merge; - virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, - const Slice& k, const Slice& v) override { - WriteBatch batch; - batch.Merge(cf, k, v); - return Write(o, &batch); - } - using DB::Get; - virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf, - const Slice& key, std::string* value) override { - return Status::NotSupported(key); - } - - using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, - std::vector* values) override { - std::vector s(keys.size(), - Status::NotSupported("Not implemented.")); - return s; - } - - using DB::AddFile; - virtual Status AddFile(ColumnFamilyHandle* column_family, - const ExternalSstFileInfo* file_path, - bool move_file) override { - return Status::NotSupported("Not implemented."); - } - virtual Status AddFile(ColumnFamilyHandle* column_family, - const std::string& file_path, - bool move_file) override { - return Status::NotSupported("Not implemented."); - } - - using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override { - return Status(); - } - - using DB::KeyMayExist; - virtual bool 
KeyMayExist(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, - bool* value_found = nullptr) override { - if (value_found != nullptr) { - *value_found = false; - } - return true; // Not Supported directly - } - using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& options, - ColumnFamilyHandle* column_family) override { - if (options.snapshot == nullptr) { - KVMap* saved = new KVMap; - *saved = map_; - return new ModelIter(saved, true); - } else { - const KVMap* snapshot_state = - &(reinterpret_cast(options.snapshot)->map_); - return new ModelIter(snapshot_state, false); - } - } - virtual Status NewIterators( - const ReadOptions& options, - const std::vector& column_family, - std::vector* iterators) override { - return Status::NotSupported("Not supported yet"); - } - virtual const Snapshot* GetSnapshot() override { - ModelSnapshot* snapshot = new ModelSnapshot; - snapshot->map_ = map_; - return snapshot; - } - - virtual void ReleaseSnapshot(const Snapshot* snapshot) override { - delete reinterpret_cast(snapshot); - } - - virtual Status Write(const WriteOptions& options, - WriteBatch* batch) override { - class Handler : public WriteBatch::Handler { - public: - KVMap* map_; - virtual void Put(const Slice& key, const Slice& value) override { - (*map_)[key.ToString()] = value.ToString(); - } - virtual void Merge(const Slice& key, const Slice& value) override { - // ignore merge for now - //(*map_)[key.ToString()] = value.ToString(); - } - virtual void Delete(const Slice& key) override { - map_->erase(key.ToString()); - } - }; - Handler handler; - handler.map_ = &map_; - return batch->Iterate(&handler); - } - - using DB::GetProperty; - virtual bool GetProperty(ColumnFamilyHandle* column_family, - const Slice& property, std::string* value) override { - return false; - } - using DB::GetIntProperty; - virtual bool GetIntProperty(ColumnFamilyHandle* column_family, - const Slice& property, uint64_t* 
value) override { - return false; - } - using DB::GetApproximateSizes; - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - bool include_memtable) override { - for (int i = 0; i < n; i++) { - sizes[i] = 0; - } - } - using DB::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* start, const Slice* end) override { - return Status::NotSupported("Not supported operation."); - } - - using DB::CompactFiles; - virtual Status CompactFiles( - const CompactionOptions& compact_options, - ColumnFamilyHandle* column_family, - const std::vector& input_file_names, - const int output_level, const int output_path_id = -1) override { - return Status::NotSupported("Not supported operation."); - } - - Status PauseBackgroundWork() override { - return Status::NotSupported("Not supported operation."); - } - - Status ContinueBackgroundWork() override { - return Status::NotSupported("Not supported operation."); - } - - using DB::NumberLevels; - virtual int NumberLevels(ColumnFamilyHandle* column_family) override { - return 1; - } - - using DB::MaxMemCompactionLevel; - virtual int MaxMemCompactionLevel( - ColumnFamilyHandle* column_family) override { - return 1; - } - - using DB::Level0StopWriteTrigger; - virtual int Level0StopWriteTrigger( - ColumnFamilyHandle* column_family) override { - return -1; - } - - virtual const std::string& GetName() const override { return name_; } - - virtual Env* GetEnv() const override { return nullptr; } - - using DB::GetOptions; - virtual const Options& GetOptions( - ColumnFamilyHandle* column_family) const override { - return options_; - } - - using DB::GetDBOptions; - virtual const DBOptions& GetDBOptions() const override { return options_; } - - using DB::Flush; - virtual Status Flush(const rocksdb::FlushOptions& options, - ColumnFamilyHandle* column_family) override { - Status ret; - return ret; - } - - virtual 
Status SyncWAL() override { - return Status::OK(); - } - - virtual Status DisableFileDeletions() override { return Status::OK(); } - virtual Status EnableFileDeletions(bool force) override { - return Status::OK(); - } - virtual Status GetLiveFiles(std::vector&, uint64_t* size, - bool flush_memtable = true) override { - return Status::OK(); - } - - virtual Status GetSortedWalFiles(VectorLogPtr& files) override { - return Status::OK(); - } - - virtual Status DeleteFile(std::string name) override { return Status::OK(); } - - virtual Status GetDbIdentity(std::string& identity) const override { - return Status::OK(); - } - - virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; } - virtual Status GetUpdatesSince( - rocksdb::SequenceNumber, unique_ptr*, - const TransactionLogIterator::ReadOptions& - read_options = TransactionLogIterator::ReadOptions()) override { - return Status::NotSupported("Not supported in Model DB"); - } - - virtual ColumnFamilyHandle* DefaultColumnFamily() const override { - return nullptr; - } - - virtual void GetColumnFamilyMetaData( - ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* metadata) override {} - - private: - class ModelIter: public Iterator { - public: - ModelIter(const KVMap* map, bool owned) - : map_(map), owned_(owned), iter_(map_->end()) { - } - ~ModelIter() { - if (owned_) delete map_; - } - virtual bool Valid() const override { return iter_ != map_->end(); } - virtual void SeekToFirst() override { iter_ = map_->begin(); } - virtual void SeekToLast() override { - if (map_->empty()) { - iter_ = map_->end(); - } else { - iter_ = map_->find(map_->rbegin()->first); - } - } - virtual void Seek(const Slice& k) override { - iter_ = map_->lower_bound(k.ToString()); - } - virtual void Next() override { ++iter_; } - virtual void Prev() override { - if (iter_ == map_->begin()) { - iter_ = map_->end(); - return; - } - --iter_; - } - - virtual Slice key() const override { return iter_->first; } - virtual Slice 
value() const override { return iter_->second; } - virtual Status status() const override { return Status::OK(); } - - private: - const KVMap* const map_; - const bool owned_; // Do we own map_ - KVMap::const_iterator iter_; - }; - const Options options_; - KVMap map_; - std::string name_ = ""; -}; - -static std::string RandomKey(Random* rnd, int minimum = 0) { - int len; - do { - len = (rnd->OneIn(3) - ? 1 // Short sometimes to encourage collisions - : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); - } while (len < minimum); - return test::RandomKey(rnd, len); -} - -static bool CompareIterators(int step, - DB* model, - DB* db, - const Snapshot* model_snap, - const Snapshot* db_snap) { - ReadOptions options; - options.snapshot = model_snap; - Iterator* miter = model->NewIterator(options); - options.snapshot = db_snap; - Iterator* dbiter = db->NewIterator(options); - bool ok = true; - int count = 0; - for (miter->SeekToFirst(), dbiter->SeekToFirst(); - ok && miter->Valid() && dbiter->Valid(); - miter->Next(), dbiter->Next()) { - count++; - if (miter->key().compare(dbiter->key()) != 0) { - fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(dbiter->key()).c_str()); - ok = false; - break; - } - - if (miter->value().compare(dbiter->value()) != 0) { - fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(miter->value()).c_str(), - EscapeString(miter->value()).c_str()); - ok = false; - } - } - - if (ok) { - if (miter->Valid() != dbiter->Valid()) { - fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. 
%d\n", - step, miter->Valid(), dbiter->Valid()); - ok = false; - } - } - delete miter; - delete dbiter; - return ok; -} - -TEST_F(DBTest, Randomized) { - anon::OptionsOverride options_override; - options_override.skip_policy = kSkipNoSnapshot; - Random rnd(test::RandomSeed()); - do { - ModelDB model(CurrentOptions(options_override)); - const int N = 10000; - const Snapshot* model_snap = nullptr; - const Snapshot* db_snap = nullptr; - std::string k, v; - for (int step = 0; step < N; step++) { - // TODO(sanjay): Test Get() works - int p = rnd.Uniform(100); - int minimum = 0; - if (option_config_ == kHashSkipList || - option_config_ == kHashLinkList || - option_config_ == kHashCuckoo || - option_config_ == kPlainTableFirstBytePrefix || - option_config_ == kBlockBasedTableWithWholeKeyHashIndex || - option_config_ == kBlockBasedTableWithPrefixHashIndex) { - minimum = 1; - } - if (p < 45) { // Put - k = RandomKey(&rnd, minimum); - v = RandomString(&rnd, - rnd.OneIn(20) - ? 100 + rnd.Uniform(100) - : rnd.Uniform(8)); - ASSERT_OK(model.Put(WriteOptions(), k, v)); - ASSERT_OK(db_->Put(WriteOptions(), k, v)); - - } else if (p < 90) { // Delete - k = RandomKey(&rnd, minimum); - ASSERT_OK(model.Delete(WriteOptions(), k)); - ASSERT_OK(db_->Delete(WriteOptions(), k)); - - - } else { // Multi-element batch - WriteBatch b; - const int num = rnd.Uniform(8); - for (int i = 0; i < num; i++) { - if (i == 0 || !rnd.OneIn(10)) { - k = RandomKey(&rnd, minimum); - } else { - // Periodically re-use the same key from the previous iter, so - // we have multiple entries in the write batch for the same key - } - if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); - } else { - b.Delete(k); - } - } - ASSERT_OK(model.Write(WriteOptions(), &b)); - ASSERT_OK(db_->Write(WriteOptions(), &b)); - } - - if ((step % 100) == 0) { - // For DB instances that use the hash index + block-based table, the - // iterator will be invalid right when seeking a non-existent key, right - // 
than return a key that is close to it. - if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && - option_config_ != kBlockBasedTableWithPrefixHashIndex) { - ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); - } - - // Save a snapshot from each DB this time that we'll use next - // time we compare things, to make sure the current state is - // preserved with the snapshot - if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); - if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - - - auto options = CurrentOptions(options_override); - Reopen(options); - ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); - - model_snap = model.GetSnapshot(); - db_snap = db_->GetSnapshot(); - } - - if ((step % 2000) == 0) { - fprintf(stderr, - "DBTest.Randomized, option ID: %d, step: %d out of %d\n", - option_config_, step, N); - } - } - if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); - if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - // skip cuckoo hash as it does not support snapshot. 
- } while (ChangeOptions(kSkipDeletesFilterFirst | kSkipNoSeekToLast | - kSkipHashCuckoo)); -} - -TEST_F(DBTest, MultiGetSimple) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "k1", "v1")); - ASSERT_OK(Put(1, "k2", "v2")); - ASSERT_OK(Put(1, "k3", "v3")); - ASSERT_OK(Put(1, "k4", "v4")); - ASSERT_OK(Delete(1, "k4")); - ASSERT_OK(Put(1, "k5", "v5")); - ASSERT_OK(Delete(1, "no_key")); - - std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); - - std::vector values(20, "Temporary data to be overwritten"); - std::vector cfs(keys.size(), handles_[1]); - - std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); - ASSERT_EQ(values.size(), keys.size()); - ASSERT_EQ(values[0], "v1"); - ASSERT_EQ(values[1], "v2"); - ASSERT_EQ(values[2], "v3"); - ASSERT_EQ(values[4], "v5"); - - ASSERT_OK(s[0]); - ASSERT_OK(s[1]); - ASSERT_OK(s[2]); - ASSERT_TRUE(s[3].IsNotFound()); - ASSERT_OK(s[4]); - ASSERT_TRUE(s[5].IsNotFound()); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, MultiGetEmpty) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - // Empty Key Set - std::vector keys; - std::vector values; - std::vector cfs; - std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); - ASSERT_EQ(s.size(), 0U); - - // Empty Database, Empty Key Set - Options options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - s = db_->MultiGet(ReadOptions(), cfs, keys, &values); - ASSERT_EQ(s.size(), 0U); - - // Empty Database, Search for Keys - keys.resize(2); - keys[0] = "a"; - keys[1] = "b"; - cfs.push_back(handles_[0]); - cfs.push_back(handles_[1]); - s = db_->MultiGet(ReadOptions(), cfs, keys, &values); - ASSERT_EQ((int)s.size(), 2); - ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); - } while (ChangeCompactOptions()); -} - -namespace { -void PrefixScanInit(DBTest *dbtest) { - char buf[100]; - std::string keystr; - const int 
small_range_sstfiles = 5; - const int big_range_sstfiles = 5; - - // Generate 11 sst files with the following prefix ranges. - // GROUP 0: [0,10] (level 1) - // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) - // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) - // - // A seek with the previous API would do 11 random I/Os (to all the - // files). With the new API and a prefix filter enabled, we should - // only do 2 random I/O, to the 2 files containing the key. - - // GROUP 0 - snprintf(buf, sizeof(buf), "%02d______:start", 0); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - snprintf(buf, sizeof(buf), "%02d______:end", 10); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, - nullptr); // move to level 1 - - // GROUP 1 - for (int i = 1; i <= small_range_sstfiles; i++) { - snprintf(buf, sizeof(buf), "%02d______:start", i); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - snprintf(buf, sizeof(buf), "%02d______:end", i+1); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - } - - // GROUP 2 - for (int i = 1; i <= big_range_sstfiles; i++) { - snprintf(buf, sizeof(buf), "%02d______:start", 0); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - snprintf(buf, sizeof(buf), "%02d______:end", - small_range_sstfiles+i+1); - keystr = std::string(buf); - ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - } -} -} // namespace - -TEST_F(DBTest, PrefixScan) { - XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, - kSkipNoPrefix); - while (ChangeFilterOptions()) { - int count; - Slice prefix; - Slice key; - char buf[100]; - Iterator* iter; - snprintf(buf, sizeof(buf), "03______:"); - prefix = Slice(buf, 8); - key = Slice(buf, 9); - ASSERT_EQ(key.difference_offset(prefix), 8); - ASSERT_EQ(prefix.difference_offset(key), 8); - // db 
configs - env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.disable_auto_compactions = true; - options.max_background_compactions = 2; - options.create_if_missing = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - table_options.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // 11 RAND I/Os - DestroyAndReopen(options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); - for (iter->Seek(prefix); iter->Valid(); iter->Next()) { - if (! iter->key().starts_with(prefix)) { - break; - } - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - Close(); - } // end of while - XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0); -} - -TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { - // create a DB with block prefix index - BlockBasedTableOptions table_options; - Options options = CurrentOptions(); - table_options.index_type = BlockBasedTableOptions::kHashSearch; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - - - Reopen(options); - ASSERT_OK(Put("k1", "v1")); - Flush(); - ASSERT_OK(Put("k2", "v2")); - - // Reopen it without prefix extractor, make sure everything still works. - // RocksDB should just fall back to the binary index. 
- table_options.index_type = BlockBasedTableOptions::kBinarySearch; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.prefix_extractor.reset(); - - Reopen(options); - ASSERT_EQ("v1", Get("k1")); - ASSERT_EQ("v2", Get("k2")); -} - -TEST_F(DBTest, ChecksumTest) { - BlockBasedTableOptions table_options; - Options options = CurrentOptions(); - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - ASSERT_OK(Flush()); // table with crc checksum - - table_options.checksum = kxxHash; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_OK(Put("e", "f")); - ASSERT_OK(Put("g", "h")); - ASSERT_OK(Flush()); // table with xxhash checksum - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_EQ("b", Get("a")); - ASSERT_EQ("d", Get("c")); - ASSERT_EQ("f", Get("e")); - ASSERT_EQ("h", Get("g")); - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_EQ("b", Get("a")); - ASSERT_EQ("d", Get("c")); - ASSERT_EQ("f", Get("e")); - ASSERT_EQ("h", Get("g")); -} - -TEST_P(DBTestWithParam, FIFOCompactionTest) { - for (int iter = 0; iter < 2; ++iter) { - // first iteration -- auto compaction - // second iteration -- manual compaction - Options options; - options.compaction_style = kCompactionStyleFIFO; - options.write_buffer_size = 100 << 10; // 100KB - options.arena_block_size = 4096; - options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB - options.compression = kNoCompression; - options.create_if_missing = true; - options.max_subcompactions = max_subcompactions_; - if (iter == 1) { - options.disable_auto_compactions = true; - } - options = CurrentOptions(options); - DestroyAndReopen(options); - 
- Random rnd(301); - for (int i = 0; i < 6; ++i) { - for (int j = 0; j < 110; ++j) { - ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980))); - } - // flush should happen here - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - } - if (iter == 0) { - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - } else { - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - } - // only 5 files should survive - ASSERT_EQ(NumTableFilesAtLevel(0), 5); - for (int i = 0; i < 50; ++i) { - // these keys should be deleted in previous compaction - ASSERT_EQ("NOT_FOUND", Get(ToString(i))); - } - } -} - -// verify that we correctly deprecated timeout_hint_us -TEST_F(DBTest, SimpleWriteTimeoutTest) { - WriteOptions write_opt; - write_opt.timeout_hint_us = 0; - ASSERT_OK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); - write_opt.timeout_hint_us = 10; - ASSERT_NOK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); -} - -/* - * This test is not reliable enough as it heavily depends on disk behavior. 
- */ -TEST_F(DBTest, RateLimitingTest) { - Options options = CurrentOptions(); - options.write_buffer_size = 1 << 20; // 1MB - options.level0_file_num_compaction_trigger = 2; - options.target_file_size_base = 1 << 20; // 1MB - options.max_bytes_for_level_base = 4 << 20; // 4MB - options.max_bytes_for_level_multiplier = 4; - options.compression = kNoCompression; - options.create_if_missing = true; - options.env = env_; - options.IncreaseParallelism(4); - DestroyAndReopen(options); - - WriteOptions wo; - wo.disableWAL = true; - - // # no rate limiting - Random rnd(301); - uint64_t start = env_->NowMicros(); - // Write ~96M data - for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK(Put(RandomString(&rnd, 32), - RandomString(&rnd, (1 << 10) + 1), wo)); - } - uint64_t elapsed = env_->NowMicros() - start; - double raw_rate = env_->bytes_written_ * 1000000 / elapsed; - Close(); - - // # rate limiting with 0.7 x threshold - options.rate_limiter.reset( - NewGenericRateLimiter(static_cast(0.7 * raw_rate))); - env_->bytes_written_ = 0; - DestroyAndReopen(options); - - start = env_->NowMicros(); - // Write ~96M data - for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK(Put(RandomString(&rnd, 32), - RandomString(&rnd, (1 << 10) + 1), wo)); - } - elapsed = env_->NowMicros() - start; - Close(); - ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); - double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; - fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio); - ASSERT_TRUE(ratio < 0.8); - - // # rate limiting with half of the raw_rate - options.rate_limiter.reset( - NewGenericRateLimiter(static_cast(raw_rate / 2))); - env_->bytes_written_ = 0; - DestroyAndReopen(options); - - start = env_->NowMicros(); - // Write ~96M data - for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK(Put(RandomString(&rnd, 32), - RandomString(&rnd, (1 << 10) + 1), wo)); - } - elapsed = env_->NowMicros() - start; - Close(); - 
ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); - ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; - fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio); - ASSERT_LT(ratio, 0.6); -} - -TEST_F(DBTest, TableOptionsSanitizeTest) { - Options options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); - ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); - - options.table_factory.reset(new PlainTableFactory()); - options.prefix_extractor.reset(NewNoopTransform()); - Destroy(options); - ASSERT_TRUE(!TryReopen(options).IsNotSupported()); - - // Test for check of prefix_extractor when hash index is used for - // block-based table - BlockBasedTableOptions to; - to.index_type = BlockBasedTableOptions::kHashSearch; - options = CurrentOptions(); - options.create_if_missing = true; - options.table_factory.reset(NewBlockBasedTableFactory(to)); - ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - ASSERT_OK(TryReopen(options)); -} - -TEST_F(DBTest, SanitizeNumThreads) { - for (int attempt = 0; attempt < 2; attempt++) { - const size_t kTotalTasks = 8; - test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; - - Options options = CurrentOptions(); - if (attempt == 0) { - options.max_background_compactions = 3; - options.max_background_flushes = 2; - } - options.create_if_missing = true; - DestroyAndReopen(options); - - for (size_t i = 0; i < kTotalTasks; i++) { - // Insert 5 tasks to low priority queue and 5 tasks to high priority queue - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_tasks[i], - (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); - } - - // Wait 100 milliseconds for they are scheduled. - env_->SleepForMicroseconds(100000); - - // pool size 3, total task 4. Queue size should be 1. 
- ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW)); - // pool size 2, total task 4. Queue size should be 2. - ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); - - for (size_t i = 0; i < kTotalTasks; i++) { - sleeping_tasks[i].WakeUp(); - sleeping_tasks[i].WaitUntilDone(); - } - - ASSERT_OK(Put("abc", "def")); - ASSERT_EQ("def", Get("abc")); - Flush(); - ASSERT_EQ("def", Get("abc")); - } -} - -TEST_F(DBTest, DBIteratorBoundTest) { - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - - options.prefix_extractor = nullptr; - DestroyAndReopen(options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("g1", "0")); - - // testing basic case with no iterate_upper_bound and no prefix_extractor - { - ReadOptions ro; - ro.iterate_upper_bound = nullptr; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("g1")), 0); - } - - // testing iterate_upper_bound and forward iterator - // to make sure it stops at bound - { - ReadOptions ro; - // iterate_upper_bound points beyond the last expected entry - Slice prefix("foo2"); - ro.iterate_upper_bound = &prefix; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("foo1")), 0); - - iter->Next(); - // should stop here... 
- ASSERT_TRUE(!iter->Valid()); - } - // Testing SeekToLast with iterate_upper_bound set - { - ReadOptions ro; - - Slice prefix("foo"); - ro.iterate_upper_bound = &prefix; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->SeekToLast(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("a")), 0); - } - - // prefix is the first letter of the key - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - - DestroyAndReopen(options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("g1", "0")); - - // testing with iterate_upper_bound and prefix_extractor - // Seek target and iterate_upper_bound are not is same prefix - // This should be an error - { - ReadOptions ro; - Slice upper_bound("g"); - ro.iterate_upper_bound = &upper_bound; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo1", iter->key().ToString()); - - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - } - - // testing that iterate_upper_bound prevents iterating over deleted items - // if the bound has already reached - { - options.prefix_extractor = nullptr; - DestroyAndReopen(options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("b", "0")); - ASSERT_OK(Put("b1", "0")); - ASSERT_OK(Put("c", "0")); - ASSERT_OK(Put("d", "0")); - ASSERT_OK(Put("e", "0")); - ASSERT_OK(Delete("c")); - ASSERT_OK(Delete("d")); - - // base case with no bound - ReadOptions ro; - ro.iterate_upper_bound = nullptr; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("b"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("b")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("b1")), 0); - - perf_context.Reset(); - iter->Next(); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 
2); - - // now testing with iterate_bound - Slice prefix("c"); - ro.iterate_upper_bound = &prefix; - - iter.reset(db_->NewIterator(ro)); - - perf_context.Reset(); - - iter->Seek("b"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("b")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("b1")), 0); - - iter->Next(); - // the iteration should stop as soon as the the bound key is reached - // even though the key is deleted - // hence internal_delete_skipped_count should be 0 - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); - } -} - -TEST_F(DBTest, WriteSingleThreadEntry) { - std::vector threads; - dbfull()->TEST_LockMutex(); - auto w = dbfull()->TEST_BeginWrite(); - threads.emplace_back([&] { Put("a", "b"); }); - env_->SleepForMicroseconds(10000); - threads.emplace_back([&] { Flush(); }); - env_->SleepForMicroseconds(10000); - dbfull()->TEST_UnlockMutex(); - dbfull()->TEST_LockMutex(); - dbfull()->TEST_EndWrite(w); - dbfull()->TEST_UnlockMutex(); - - for (auto& t : threads) { - t.join(); - } -} - -TEST_F(DBTest, DisableDataSyncTest) { - env_->sync_counter_.store(0); - // iter 0 -- no sync - // iter 1 -- sync - for (int iter = 0; iter < 2; ++iter) { - Options options = CurrentOptions(); - options.disableDataSync = iter == 0; - options.create_if_missing = true; - options.num_levels = 10; - options.env = env_; - Reopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - - MakeTables(10, "a", "z"); - Compact("a", "z"); - - if (iter == 0) { - ASSERT_EQ(env_->sync_counter_.load(), 0); - } else { - ASSERT_GT(env_->sync_counter_.load(), 0); - } - Destroy(options); - } -} - -TEST_F(DBTest, DynamicMemtableOptions) { - const uint64_t k64KB = 1 << 16; - const uint64_t k128KB = 1 << 17; - const uint64_t k5KB = 5 * 1024; - const int kNumPutsBeforeWaitForFlush = 64; - Options options; - options.env = env_; - options.create_if_missing = true; - options.compression = 
kNoCompression; - options.max_background_compactions = 1; - options.write_buffer_size = k64KB; - options.arena_block_size = 16 * 1024; - options.max_write_buffer_number = 2; - // Don't trigger compact/slowdown/stop - options.level0_file_num_compaction_trigger = 1024; - options.level0_slowdown_writes_trigger = 1024; - options.level0_stop_writes_trigger = 1024; - DestroyAndReopen(options); - - auto gen_l0_kb = [this, kNumPutsBeforeWaitForFlush](int size) { - Random rnd(301); - for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - - // The following condition prevents a race condition between flush jobs - // acquiring work and this thread filling up multiple memtables. Without - // this, the flush might produce less files than expected because - // multiple memtables are flushed into a single L0 file. This race - // condition affects assertion (A). - if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) { - dbfull()->TEST_WaitForFlushMemTable(); - } - } - dbfull()->TEST_WaitForFlushMemTable(); - }; - - // Test write_buffer_size - gen_l0_kb(64); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - ASSERT_LT(SizeAtLevel(0), k64KB + k5KB); - ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2); - - // Clean up L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - - // Increase buffer size - ASSERT_OK(dbfull()->SetOptions({ - {"write_buffer_size", "131072"}, - })); - - // The existing memtable is still 64KB in size, after it becomes immutable, - // the next memtable will be 128KB in size. 
Write 256KB total, we should - // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data - gen_l0_kb(256); - ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A) - ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); - ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB); - - // Test max_write_buffer_number - // Block compaction thread, which will also block the flushes because - // max_background_flushes == 0, so flushes are getting executed by the - // compaction thread - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - // Start from scratch and disable compaction/flush. Flush can only happen - // during compaction but trigger is pretty high - options.max_background_flushes = 0; - options.disable_auto_compactions = true; - DestroyAndReopen(options); - - // Put until writes are stopped, bounded by 256 puts. We should see stop at - // ~128KB - int count = 0; - Random rnd(301); - - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DelayWrite:Wait", - [&](void* arg) { sleeping_task_low.WakeUp(); }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - - while (!sleeping_task_low.WokenUp() && count < 256) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); - count++; - } - ASSERT_GT(static_cast(count), 128 * 0.8); - ASSERT_LT(static_cast(count), 128 * 1.2); - - sleeping_task_low.WaitUntilDone(); - - // Increase - ASSERT_OK(dbfull()->SetOptions({ - {"max_write_buffer_number", "8"}, - })); - // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - count = 0; - while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); - count++; - } - // 
Windows fails this test. Will tune in the future and figure out - // approp number -#ifndef OS_WIN - ASSERT_GT(static_cast(count), 512 * 0.8); - ASSERT_LT(static_cast(count), 512 * 1.2); -#endif - sleeping_task_low.WaitUntilDone(); - - // Decrease - ASSERT_OK(dbfull()->SetOptions({ - {"max_write_buffer_number", "4"}, - })); - // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - - count = 0; - while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); - count++; - } - // Windows fails this test. Will tune in the future and figure out - // approp number -#ifndef OS_WIN - ASSERT_GT(static_cast(count), 256 * 0.8); - ASSERT_LT(static_cast(count), 266 * 1.2); -#endif - sleeping_task_low.WaitUntilDone(); - - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); -} - -#if ROCKSDB_USING_THREAD_STATUS -namespace { -void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type, - int expected_count) { - int op_count = 0; - std::vector thread_list; - ASSERT_OK(env->GetThreadList(&thread_list)); - for (auto thread : thread_list) { - if (thread.operation_type == op_type) { - op_count++; - } - } - ASSERT_EQ(op_count, expected_count); -} -} // namespace - -TEST_F(DBTest, GetThreadStatus) { - Options options; - options.env = env_; - options.enable_thread_tracking = true; - TryReopen(options); +TEST_F(DBTest, GroupCommitTest) { + do { + Options options = CurrentOptions(); + options.env = env_; + env_->log_write_slowdown_.store(100); + options.statistics = rocksdb::CreateDBStatistics(); + Reopen(options); - std::vector thread_list; - Status s = env_->GetThreadList(&thread_list); + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + 
thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } - for (int i = 0; i < 2; ++i) { - // repeat the test with differet number of high / low priority threads - const int kTestCount = 3; - const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5}; - const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3}; - for (int test = 0; test < kTestCount; ++test) { - // Change the number of threads in high / low priority pool. - env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH); - env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); - // Wait to ensure the all threads has been registered - env_->SleepForMicroseconds(100000); - s = env_->GetThreadList(&thread_list); - ASSERT_OK(s); - unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES]; - memset(thread_type_counts, 0, sizeof(thread_type_counts)); - for (auto thread : thread_list) { - ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES); - thread_type_counts[thread.thread_type]++; + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); } - // Verify the total number of threades - ASSERT_EQ( - thread_type_counts[ThreadStatus::HIGH_PRIORITY] + - thread_type_counts[ThreadStatus::LOW_PRIORITY], - kHighPriCounts[test] + kLowPriCounts[test]); - // Verify the number of high-priority threads - ASSERT_EQ( - thread_type_counts[ThreadStatus::HIGH_PRIORITY], - kHighPriCounts[test]); - // Verify the number of low-priority threads - ASSERT_EQ( - thread_type_counts[ThreadStatus::LOW_PRIORITY], - kLowPriCounts[test]); - } - if (i == 0) { - // repeat the test with multiple column families - CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - handles_, true); } - } - db_->DropColumnFamily(handles_[2]); - delete handles_[2]; - handles_.erase(handles_.begin() + 2); - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - handles_, 
true); - Close(); - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - handles_, true); -} - -TEST_F(DBTest, DisableThreadStatus) { - Options options; - options.env = env_; - options.enable_thread_tracking = false; - TryReopen(options); - CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); - // Verify non of the column family info exists - env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( - handles_, false); -} - -TEST_F(DBTest, ThreadStatusFlush) { - Options options; - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - options.enable_thread_tracking = true; - options = CurrentOptions(options); - - rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"}, - {"DBTest::ThreadStatusFlush:2", - "FlushJob::LogAndNotifyTableFileCreation()"}, - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - - CreateAndReopenWithCF({"pikachu"}, options); - VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); - - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_EQ("v1", Get(1, "foo")); - VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); - - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush - - // The first sync point is to make sure there's one flush job - // running when we perform VerifyOperationCount(). - TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1"); - VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1); - // This second sync point is to ensure the flush job will not - // be completed until we already perform VerifyOperationCount(). 
- TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2"); - - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); -} - -TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) { - const int kTestKeySize = 16; - const int kTestValueSize = 984; - const int kEntrySize = kTestKeySize + kTestValueSize; - const int kEntriesPerBuffer = 100; - Options options; - options.create_if_missing = true; - options.write_buffer_size = kEntrySize * kEntriesPerBuffer; - options.compaction_style = kCompactionStyleLevel; - options.target_file_size_base = options.write_buffer_size; - options.max_bytes_for_level_base = options.target_file_size_base * 2; - options.max_bytes_for_level_multiplier = 2; - options.compression = kNoCompression; - options = CurrentOptions(options); - options.env = env_; - options.enable_thread_tracking = true; - const int kNumL0Files = 4; - options.level0_file_num_compaction_trigger = kNumL0Files; - options.max_subcompactions = max_subcompactions_; + env_->log_write_slowdown_.store(0); - rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"}, - {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"}, - {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"}, - }); - for (int tests = 0; tests < 2; ++tests) { - DestroyAndReopen(options); - rocksdb::SyncPoint::GetInstance()->ClearTrace(); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); - Random rnd(301); - // The Put Phase. 
- for (int file = 0; file < kNumL0Files; ++file) { - for (int key = 0; key < kEntriesPerBuffer; ++key) { - ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer), - RandomString(&rnd, kTestValueSize))); - } - Flush(); + std::vector expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(ToString(i)); } - // This makes sure a compaction won't be scheduled until - // we have done with the above Put Phase. - TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0"); - ASSERT_GE(NumTableFilesAtLevel(0), - options.level0_file_num_compaction_trigger); - - // This makes sure at least one compaction is running. - TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1"); + std::sort(expected_db.begin(), expected_db.end()); - if (options.enable_thread_tracking) { - // expecting one single L0 to L1 compaction - VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1); - } else { - // If thread tracking is not enabled, compaction count should be 0. - VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0); + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); } - // TODO(yhchiang): adding assert to verify each compaction stage. - TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2"); + ASSERT_TRUE(!itr->Valid()); + delete itr; - // repeat the test with disabling thread tracking. 
- options.enable_thread_tracking = false; - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - } + HistogramData hist_data = {0, 0, 0, 0, 0}; + options.statistics->histogramData(DB_WRITE, &hist_data); + ASSERT_GT(hist_data.average, 0.0); + } while (ChangeOptions(kSkipNoSeekToLast)); } -TEST_P(DBTestWithParam, PreShutdownManualCompaction) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - options.max_subcompactions = max_subcompactions_; - CreateAndReopenWithCF({"pikachu"}, options); - - // iter - 0 with 7 levels - // iter - 1 with 3 levels - for (int iter = 0; iter < 2; ++iter) { - MakeTables(3, "p", "q", 1); - ASSERT_EQ("1,1,1", FilesPerLevel(1)); - - // Compaction range falls before files - Compact(1, "", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel(1)); - - // Compaction range falls after files - Compact(1, "r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel(1)); - - // Compaction range overlaps files - Compact(1, "p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel(1)); - - // Populate a different range - MakeTables(3, "c", "e", 1); - ASSERT_EQ("1,1,2", FilesPerLevel(1)); - - // Compact just the new range - Compact(1, "b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel(1)); +namespace { +typedef std::map KVMap; +} - // Compact all - MakeTables(1, "a", "z", 1); - ASSERT_EQ("1,0,2", FilesPerLevel(1)); - CancelAllBackgroundWork(db_); - db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); - ASSERT_EQ("1,0,2", FilesPerLevel(1)); +class ModelDB : public DB { + public: + class ModelSnapshot : public Snapshot { + public: + KVMap map_; - if (iter == 0) { - options = CurrentOptions(); - options.max_background_flushes = 0; - options.num_levels = 3; - options.create_if_missing = true; - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); + virtual SequenceNumber GetSequenceNumber() const override { + // no need to call this + assert(false); + return 0; } + }; + + explicit ModelDB(const Options& options) : 
options_(options) {} + using DB::Put; + virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& k, const Slice& v) override { + WriteBatch batch; + batch.Put(cf, k, v); + return Write(o, &batch); + } + using DB::Delete; + virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + batch.Delete(cf, key); + return Write(o, &batch); + } + using DB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + batch.SingleDelete(cf, key); + return Write(o, &batch); + } + using DB::Merge; + virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& k, const Slice& v) override { + WriteBatch batch; + batch.Merge(cf, k, v); + return Write(o, &batch); + } + using DB::Get; + virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf, + const Slice& key, std::string* value) override { + return Status::NotSupported(key); } -} - -TEST_F(DBTest, PreShutdownFlush) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(Put(1, "key", "value")); - CancelAllBackgroundWork(db_); - Status s = - db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); - ASSERT_TRUE(s.IsShutdownInProgress()); -} - -TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { - const int kTestKeySize = 16; - const int kTestValueSize = 984; - const int kEntrySize = kTestKeySize + kTestValueSize; - const int kEntriesPerBuffer = 40; - const int kNumL0Files = 4; - - const int kHighPriCount = 3; - const int kLowPriCount = 5; - env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); - env_->SetBackgroundThreads(kLowPriCount, Env::LOW); - Options options; - options.create_if_missing = true; - options.write_buffer_size = kEntrySize * kEntriesPerBuffer; - options.compaction_style = kCompactionStyleLevel; - 
options.target_file_size_base = options.write_buffer_size; - options.max_bytes_for_level_base = - options.target_file_size_base * kNumL0Files; - options.compression = kNoCompression; - options = CurrentOptions(options); - options.env = env_; - options.enable_thread_tracking = true; - options.level0_file_num_compaction_trigger = kNumL0Files; - options.max_bytes_for_level_multiplier = 2; - options.max_background_compactions = kLowPriCount; - options.level0_stop_writes_trigger = 1 << 10; - options.level0_slowdown_writes_trigger = 1 << 10; - options.max_subcompactions = max_subcompactions_; + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override { + std::vector s(keys.size(), + Status::NotSupported("Not implemented.")); + return s; + } - TryReopen(options); - Random rnd(301); +#ifndef ROCKSDB_LITE + using DB::AddFile; + virtual Status AddFile(ColumnFamilyHandle* column_family, + const std::vector& file_info_list, + bool move_file) override { + return Status::NotSupported("Not implemented."); + } + virtual Status AddFile(ColumnFamilyHandle* column_family, + const std::vector& file_path_list, + bool move_file) override { + return Status::NotSupported("Not implemented."); + } - std::vector thread_list; - // Delay both flush and compaction - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"}, - {"CompactionJob::Run():Start", - "DBTest::PreShutdownMultipleCompaction:Preshutdown"}, - {"CompactionJob::Run():Start", - "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"}, - {"DBTest::PreShutdownMultipleCompaction:Preshutdown", - "CompactionJob::Run():End"}, - {"CompactionJob::Run():End", - "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}}); + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* 
props) override { + return Status(); + } - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override { + return Status(); + } +#endif // ROCKSDB_LITE - // Make rocksdb busy - int key = 0; - // check how many threads are doing compaction using GetThreadList - int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; - for (int file = 0; file < 16 * kNumL0Files; ++file) { - for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + if (value_found != nullptr) { + *value_found = false; } - - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; + return true; // Not Supported directly + } + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override { + if (options.snapshot == nullptr) { + KVMap* saved = new KVMap; + *saved = map_; + return new ModelIter(saved, true); + } else { + const KVMap* snapshot_state = + &(reinterpret_cast(options.snapshot)->map_); + return new ModelIter(snapshot_state, false); } + } + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_family, + std::vector* iterators) override { + return Status::NotSupported("Not supported yet"); + } + virtual const Snapshot* GetSnapshot() override { + ModelSnapshot* snapshot = new ModelSnapshot; + snapshot->map_ = map_; + return snapshot; + } - // Speed up the test - if (operation_count[ThreadStatus::OP_FLUSH] > 1 && - operation_count[ThreadStatus::OP_COMPACTION] > - 0.6 * options.max_background_compactions) { - break; - } - 
if (file == 15 * kNumL0Files) { - TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); - } + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + delete reinterpret_cast(snapshot); } - TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); - ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); - CancelAllBackgroundWork(db_); - TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); - // Record the number of compactions at a time. - for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { - operation_count[i] = 0; + virtual Status Write(const WriteOptions& options, + WriteBatch* batch) override { + class Handler : public WriteBatch::Handler { + public: + KVMap* map_; + virtual void Put(const Slice& key, const Slice& value) override { + (*map_)[key.ToString()] = value.ToString(); + } + virtual void Merge(const Slice& key, const Slice& value) override { + // ignore merge for now + // (*map_)[key.ToString()] = value.ToString(); + } + virtual void Delete(const Slice& key) override { + map_->erase(key.ToString()); + } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); } - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; + + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return false; + } + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override { + return false; + } + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { + return false; + } + using DB::GetApproximateSizes; + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes, + bool 
include_memtable) override { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* start, const Slice* end) override { + return Status::NotSupported("Not supported operation."); } - ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); -} -TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { - const int kTestKeySize = 16; - const int kTestValueSize = 984; - const int kEntrySize = kTestKeySize + kTestValueSize; - const int kEntriesPerBuffer = 40; - const int kNumL0Files = 4; + using DB::CompactFiles; + virtual Status CompactFiles(const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, + const int output_path_id = -1) override { + return Status::NotSupported("Not supported operation."); + } - const int kHighPriCount = 3; - const int kLowPriCount = 5; - env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); - env_->SetBackgroundThreads(kLowPriCount, Env::LOW); + Status PauseBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } - Options options; - options.create_if_missing = true; - options.write_buffer_size = kEntrySize * kEntriesPerBuffer; - options.compaction_style = kCompactionStyleLevel; - options.target_file_size_base = options.write_buffer_size; - options.max_bytes_for_level_base = - options.target_file_size_base * kNumL0Files; - options.compression = kNoCompression; - options = CurrentOptions(options); - options.env = env_; - options.enable_thread_tracking = true; - options.level0_file_num_compaction_trigger = kNumL0Files; - options.max_bytes_for_level_multiplier = 2; - options.max_background_compactions = kLowPriCount; - options.level0_stop_writes_trigger = 1 << 10; - options.level0_slowdown_writes_trigger = 1 << 10; - options.max_subcompactions = max_subcompactions_; + Status 
ContinueBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } - TryReopen(options); - Random rnd(301); + Status EnableAutoCompaction( + const std::vector& column_family_handles) override { + return Status::NotSupported("Not supported operation."); + } - std::vector thread_list; - // Delay both flush and compaction - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBTest::PreShutdownCompactionMiddle:Preshutdown", - "CompactionJob::Run():Inprogress"}, - {"CompactionJob::Run():Start", - "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"}, - {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"}, - {"CompactionJob::Run():End", - "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}}); + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return 1; + } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel( + ColumnFamilyHandle* column_family) override { + return 1; + } - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override { + return -1; + } - // Make rocksdb busy - int key = 0; - // check how many threads are doing compaction using GetThreadList - int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; - for (int file = 0; file < 16 * kNumL0Files; ++file) { - for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); - } + virtual const std::string& GetName() const override { return name_; } - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; - } + virtual Env* GetEnv() const override { return nullptr; } - // Speed up the test - if (operation_count[ThreadStatus::OP_FLUSH] > 1 && - operation_count[ThreadStatus::OP_COMPACTION] > - 0.6 * options.max_background_compactions) { - break; - } - if 
(file == 15 * kNumL0Files) { - TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction"); - } + using DB::GetOptions; + virtual const Options& GetOptions( + ColumnFamilyHandle* column_family) const override { + return options_; } - ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); - CancelAllBackgroundWork(db_); - TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); - TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); - // Record the number of compactions at a time. - for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { - operation_count[i] = 0; - } - Status s = env_->GetThreadList(&thread_list); - for (auto thread : thread_list) { - operation_count[thread.operation_type]++; + using DB::GetDBOptions; + virtual const DBOptions& GetDBOptions() const override { return options_; } + + using DB::Flush; + virtual Status Flush(const rocksdb::FlushOptions& options, + ColumnFamilyHandle* column_family) override { + Status ret; + return ret; } - ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); -} -#endif // ROCKSDB_USING_THREAD_STATUS + virtual Status SyncWAL() override { return Status::OK(); } -TEST_F(DBTest, FlushOnDestroy) { - WriteOptions wo; - wo.disableWAL = true; - ASSERT_OK(Put("foo", "v1", wo)); - CancelAllBackgroundWork(db_); -} +#ifndef ROCKSDB_LITE + virtual Status DisableFileDeletions() override { return Status::OK(); } -namespace { -class OnFileDeletionListener : public EventListener { - public: - OnFileDeletionListener() : - matched_count_(0), - expected_file_name_("") {} + virtual Status EnableFileDeletions(bool force) override { + return Status::OK(); + } + virtual Status GetLiveFiles(std::vector&, uint64_t* size, + bool flush_memtable = true) override { + return Status::OK(); + } - void SetExpectedFileName( - const std::string file_name) { - expected_file_name_ = file_name; + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return 
Status::OK(); } - void VerifyMatchedCount(size_t expected_value) { - ASSERT_EQ(matched_count_, expected_value); + virtual Status DeleteFile(std::string name) override { return Status::OK(); } + + virtual Status GetUpdatesSince( + rocksdb::SequenceNumber, unique_ptr*, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) override { + return Status::NotSupported("Not supported in Model DB"); } - void OnTableFileDeleted( - const TableFileDeletionInfo& info) override { - if (expected_file_name_ != "") { - ASSERT_EQ(expected_file_name_, info.file_path); - expected_file_name_ = ""; - matched_count_++; - } + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override {} +#endif // ROCKSDB_LITE + + virtual Status GetDbIdentity(std::string& identity) const override { + return Status::OK(); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; } + + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + return nullptr; } private: - size_t matched_count_; - std::string expected_file_name_; + class ModelIter : public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) {} + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const override { return iter_ != map_->end(); } + virtual void SeekToFirst() override { iter_ = map_->begin(); } + virtual void SeekToLast() override { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) override { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() override { ++iter_; } + virtual void Prev() override { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + virtual Slice key() const override { return iter_->first; } + virtual Slice value() const override { return iter_->second; 
} + virtual Status status() const override { return Status::OK(); } + + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + std::string name_ = ""; }; -} // namespace +static std::string RandomKey(Random* rnd, int minimum = 0) { + int len; + do { + len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + } while (len < minimum); + return test::RandomKey(rnd, len); +} -TEST_F(DBTest, DynamicLevelCompressionPerLevel) { - if (!Snappy_Supported()) { - return; +static bool CompareIterators(int step, DB* model, DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; + } } - const int kNKeys = 120; - int keys[kNKeys]; - for (int i = 0; i < kNKeys; i++) { - keys[i] = i; + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. 
%d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } } - std::random_shuffle(std::begin(keys), std::end(keys)); + delete miter; + delete dbiter; + return ok; +} - Random rnd(301); - Options options; - options.create_if_missing = true; - options.db_write_buffer_size = 20480; - options.write_buffer_size = 20480; - options.max_write_buffer_number = 2; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 2; - options.target_file_size_base = 2048; - options.level_compaction_dynamic_level_bytes = true; - options.max_bytes_for_level_base = 102400; - options.max_bytes_for_level_multiplier = 4; - options.max_background_compactions = 1; - options.num_levels = 5; +class DBTestRandomized : public DBTest, + public ::testing::WithParamInterface { + public: + virtual void SetUp() override { option_config_ = GetParam(); } - options.compression_per_level.resize(3); - options.compression_per_level[0] = kNoCompression; - options.compression_per_level[1] = kNoCompression; - options.compression_per_level[2] = kSnappyCompression; + static std::vector GenerateOptionConfigs() { + std::vector option_configs; + // skip cuckoo hash as it does not support snapshot. 
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (!ShouldSkipOptions(option_config, kSkipDeletesFilterFirst | + kSkipNoSeekToLast | + kSkipHashCuckoo)) { + option_configs.push_back(option_config); + } + } + option_configs.push_back(kBlockBasedTableWithIndexRestartInterval); + return option_configs; + } +}; - OnFileDeletionListener* listener = new OnFileDeletionListener(); - options.listeners.emplace_back(listener); +INSTANTIATE_TEST_CASE_P( + DBTestRandomized, DBTestRandomized, + ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs())); +TEST_P(DBTestRandomized, Randomized) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); DestroyAndReopen(options); - // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should - // be compressed, so total data size should be more than 80K. - for (int i = 0; i < 20; i++) { - ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); - } - Flush(); - dbfull()->TEST_WaitForCompact(); + Random rnd(test::RandomSeed() + GetParam()); + ModelDB model(options); + const int N = 10000; + const Snapshot* model_snap = nullptr; + const Snapshot* db_snap = nullptr; + std::string k, v; + for (int step = 0; step < N; step++) { + // TODO(sanjay): Test Get() works + int p = rnd.Uniform(100); + int minimum = 0; + if (option_config_ == kHashSkipList || option_config_ == kHashLinkList || + option_config_ == kHashCuckoo || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex || + option_config_ == kBlockBasedTableWithPrefixHashIndex) { + minimum = 1; + } + if (p < 45) { // Put + k = RandomKey(&rnd, minimum); + v = RandomString(&rnd, + rnd.OneIn(20) ? 
100 + rnd.Uniform(100) : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + } else if (p < 90) { // Delete + k = RandomKey(&rnd, minimum); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd, minimum); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_EQ(NumTableFilesAtLevel(3), 0); - ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U); + if ((step % 100) == 0) { + // For DB instances that use the hash index + block-based table, the + // iterator will be invalid right when seeking a non-existent key, right + // than return a key that is close to it. + if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && + option_config_ != kBlockBasedTableWithPrefixHashIndex) { + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + } - // Insert 400KB. 
Some data will be compressed - for (int i = 21; i < 120; i++) { - ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); - } - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U); - // Make sure data in files in L3 is not compacted by removing all files - // in L4 and calculate number of rows - ASSERT_OK(dbfull()->SetOptions({ - {"disable_auto_compactions", "true"}, - })); - ColumnFamilyMetaData cf_meta; - db_->GetColumnFamilyMetaData(&cf_meta); - for (auto file : cf_meta.levels[4].files) { - listener->SetExpectedFileName(dbname_ + file.name); - ASSERT_OK(dbfull()->DeleteFile(file.name)); - } - listener->VerifyMatchedCount(cf_meta.levels[4].files.size()); + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - int num_keys = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - num_keys++; + Reopen(options); + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } } - ASSERT_OK(iter->status()); - ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U); + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); } -TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { - if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) { - return; - } - const int kNKeys = 500; - int keys[kNKeys]; - for (int i = 0; i < kNKeys; i++) { - keys[i] = i; - } - std::random_shuffle(std::begin(keys), std::end(keys)); +TEST_F(DBTest, MultiGetSimple) { + do { + 
CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "k1", "v1")); + ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Delete(1, "k4")); + ASSERT_OK(Put(1, "k5", "v5")); + ASSERT_OK(Delete(1, "no_key")); - Random rnd(301); - Options options; - options.create_if_missing = true; - options.db_write_buffer_size = 6000; - options.write_buffer_size = 6000; - options.max_write_buffer_number = 2; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 2; - options.soft_rate_limit = 1.1; + std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); - // Use file size to distinguish levels - // L1: 10, L2: 20, L3 40, L4 80 - // L0 is less than 30 - options.target_file_size_base = 10; - options.target_file_size_multiplier = 2; + std::vector values(20, "Temporary data to be overwritten"); + std::vector cfs(keys.size(), handles_[1]); - options.level_compaction_dynamic_level_bytes = true; - options.max_bytes_for_level_base = 200; - options.max_bytes_for_level_multiplier = 8; - options.max_background_compactions = 1; - options.num_levels = 5; - std::shared_ptr mtf(new mock::MockTableFactory); - options.table_factory = mtf; + std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(values[0], "v1"); + ASSERT_EQ(values[1], "v2"); + ASSERT_EQ(values[2], "v3"); + ASSERT_EQ(values[4], "v5"); - options.compression_per_level.resize(3); - options.compression_per_level[0] = kNoCompression; - options.compression_per_level[1] = kLZ4Compression; - options.compression_per_level[2] = kZlibCompression; + ASSERT_OK(s[0]); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_TRUE(s[3].IsNotFound()); + ASSERT_OK(s[4]); + ASSERT_TRUE(s[5].IsNotFound()); + } while (ChangeCompactOptions()); +} - DestroyAndReopen(options); - // When base level is L4, L4 is LZ4. 
- std::atomic num_zlib(0); - std::atomic num_lz4(0); - std::atomic num_no(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); - if (compaction->output_level() == 4) { - ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); - num_lz4.fetch_add(1); - } - }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = reinterpret_cast(arg); - ASSERT_TRUE(*compression == kNoCompression); - num_no.fetch_add(1); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +TEST_F(DBTest, MultiGetEmpty) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + // Empty Key Set + std::vector keys; + std::vector values; + std::vector cfs; + std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(s.size(), 0U); - for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); - } - Flush(); - dbfull()->TEST_WaitForCompact(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + // Empty Database, Empty Key Set + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(s.size(), 0U); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_EQ(NumTableFilesAtLevel(3), 0); - ASSERT_GT(NumTableFilesAtLevel(4), 0); - ASSERT_GT(num_no.load(), 2); - ASSERT_GT(num_lz4.load(), 0); - int prev_num_files_l4 = NumTableFilesAtLevel(4); + // Empty Database, Search for Keys + keys.resize(2); + keys[0] = "a"; + keys[1] = "b"; + cfs.push_back(handles_[0]); + cfs.push_back(handles_[1]); + s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(static_cast(s.size()), 
2); + ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); + } while (ChangeCompactOptions()); +} - // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib - num_lz4.store(0); - num_no.store(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); - if (compaction->output_level() == 4 && compaction->start_level() == 3) { - ASSERT_TRUE(compaction->output_compression() == kZlibCompression); - num_zlib.fetch_add(1); - } else { - ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); - num_lz4.fetch_add(1); - } - }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = reinterpret_cast(arg); - ASSERT_TRUE(*compression == kNoCompression); - num_no.fetch_add(1); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - for (int i = 101; i < 500; i++) { - ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); - if (i % 100 == 99) { - Flush(); - dbfull()->TEST_WaitForCompact(); - } - } + Reopen(options); + ASSERT_OK(Put("k1", "v1")); + Flush(); + ASSERT_OK(Put("k2", "v2")); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_GT(NumTableFilesAtLevel(3), 0); - ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4); - ASSERT_GT(num_no.load(), 2); - ASSERT_GT(num_lz4.load(), 0); - ASSERT_GT(num_zlib.load(), 0); + // Reopen it without 
prefix extractor, make sure everything still works. + // RocksDB should just fall back to the binary index. + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(); + + Reopen(options); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); } -TEST_F(DBTest, DynamicCompactionOptions) { - // minimum write buffer size is enforced at 64KB - const uint64_t k32KB = 1 << 15; - const uint64_t k64KB = 1 << 16; - const uint64_t k128KB = 1 << 17; - const uint64_t k1MB = 1 << 20; - const uint64_t k4KB = 1 << 12; - Options options; - options.env = env_; - options.create_if_missing = true; - options.compression = kNoCompression; - options.soft_rate_limit = 1.1; - options.write_buffer_size = k64KB; - options.arena_block_size = 4 * k4KB; - options.max_write_buffer_number = 2; - // Compaction related options - options.level0_file_num_compaction_trigger = 3; - options.level0_slowdown_writes_trigger = 4; - options.level0_stop_writes_trigger = 8; - options.max_grandparent_overlap_factor = 10; - options.expanded_compaction_factor = 25; - options.source_compaction_factor = 1; - options.target_file_size_base = k64KB; - options.target_file_size_multiplier = 1; - options.max_bytes_for_level_base = k128KB; - options.max_bytes_for_level_multiplier = 4; +TEST_F(DBTest, ChecksumTest) { + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); - // Block flush thread and disable compaction thread - env_->SetBackgroundThreads(1, Env::LOW); - env_->SetBackgroundThreads(1, Env::HIGH); - DestroyAndReopen(options); + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Flush()); // table with crc checksum + + table_options.checksum = kxxHash; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + 
Reopen(options); + ASSERT_OK(Put("e", "f")); + ASSERT_OK(Put("g", "h")); + ASSERT_OK(Flush()); // table with xxhash checksum + + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_EQ("b", Get("a")); + ASSERT_EQ("d", Get("c")); + ASSERT_EQ("f", Get("e")); + ASSERT_EQ("h", Get("g")); + + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_EQ("b", Get("a")); + ASSERT_EQ("d", Get("c")); + ASSERT_EQ("f", Get("e")); + ASSERT_EQ("h", Get("g")); +} + +#ifndef ROCKSDB_LITE +TEST_P(DBTestWithParam, FIFOCompactionTest) { + for (int iter = 0; iter < 2; ++iter) { + // first iteration -- auto compaction + // second iteration -- manual compaction + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 100 << 10; // 100KB + options.arena_block_size = 4096; + options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB + options.compression = kNoCompression; + options.create_if_missing = true; + options.max_subcompactions = max_subcompactions_; + if (iter == 1) { + options.disable_auto_compactions = true; + } + options = CurrentOptions(options); + DestroyAndReopen(options); - auto gen_l0_kb = [this](int start, int size, int stride) { Random rnd(301); - for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 110; ++j) { + ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980))); + } + // flush should happen here + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForFlushMemTable(); - }; - - // Write 3 files that have the same key range. - // Since level0_file_num_compaction_trigger is 3, compaction should be - // triggered. 
The compaction should result in one L1 file - gen_l0_kb(0, 64, 1); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - gen_l0_kb(0, 64, 1); - ASSERT_EQ(NumTableFilesAtLevel(0), 2); - gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,1", FilesPerLevel()); - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(1U, metadata.size()); - ASSERT_LE(metadata[0].size, k64KB + k4KB); - ASSERT_GE(metadata[0].size, k64KB - k4KB); + if (iter == 0) { + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } else { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + // only 5 files should survive + ASSERT_EQ(NumTableFilesAtLevel(0), 5); + for (int i = 0; i < 50; ++i) { + // these keys should be deleted in previous compaction + ASSERT_EQ("NOT_FOUND", Get(ToString(i))); + } + } +} +#endif // ROCKSDB_LITE - // Test compaction trigger and target_file_size_base - // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. - // Writing to 64KB L0 files should trigger a compaction. Since these - // 2 L0 files have the same key range, compaction merge them and should - // result in 2 32KB L1 files. 
- ASSERT_OK(dbfull()->SetOptions({ - {"level0_file_num_compaction_trigger", "2"}, - {"target_file_size_base", ToString(k32KB) } - })); +// verify that we correctly deprecated timeout_hint_us +TEST_F(DBTest, SimpleWriteTimeoutTest) { + WriteOptions write_opt; + write_opt.timeout_hint_us = 0; + ASSERT_OK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); + write_opt.timeout_hint_us = 10; + ASSERT_NOK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt)); +} - gen_l0_kb(0, 64, 1); - ASSERT_EQ("1,1", FilesPerLevel()); - gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,2", FilesPerLevel()); - metadata.clear(); - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(2U, metadata.size()); - ASSERT_LE(metadata[0].size, k32KB + k4KB); - ASSERT_GE(metadata[0].size, k32KB - k4KB); - ASSERT_LE(metadata[1].size, k32KB + k4KB); - ASSERT_GE(metadata[1].size, k32KB - k4KB); +#ifndef ROCKSDB_LITE +/* + * This test is not reliable enough as it heavily depends on disk behavior. + */ +TEST_F(DBTest, RateLimitingTest) { + Options options = CurrentOptions(); + options.write_buffer_size = 1 << 20; // 1MB + options.level0_file_num_compaction_trigger = 2; + options.target_file_size_base = 1 << 20; // 1MB + options.max_bytes_for_level_base = 4 << 20; // 4MB + options.max_bytes_for_level_multiplier = 4; + options.compression = kNoCompression; + options.create_if_missing = true; + options.env = env_; + options.IncreaseParallelism(4); + DestroyAndReopen(options); - // Test max_bytes_for_level_base - // Increase level base size to 256KB and write enough data that will - // fill L1 and L2. L1 size should be around 256KB while L2 size should be - // around 256KB x 4. 
- ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_base", ToString(k1MB) } - })); + WriteOptions wo; + wo.disableWAL = true; - // writing 96 x 64KB => 6 * 1024KB - // (L1 + L2) = (1 + 4) * 1024KB - for (int i = 0; i < 96; ++i) { - gen_l0_kb(i, 64, 96); + // # no rate limiting + Random rnd(301); + uint64_t start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); } - dbfull()->TEST_WaitForCompact(); - ASSERT_GT(SizeAtLevel(1), k1MB / 2); - ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); - - // Within (0.5, 1.5) of 4MB. - ASSERT_GT(SizeAtLevel(2), 2 * k1MB); - ASSERT_LT(SizeAtLevel(2), 6 * k1MB); + uint64_t elapsed = env_->NowMicros() - start; + double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed; + Close(); - // Test max_bytes_for_level_multiplier and - // max_bytes_for_level_base. Now, reduce both mulitplier and level base, - // After filling enough data that can fit in L1 - L3, we should see L1 size - // reduces to 128KB from 256KB which was asserted previously. Same for L2. - ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_multiplier", "2"}, - {"max_bytes_for_level_base", ToString(k128KB) } - })); + // # rate limiting with 0.7 x threshold + options.rate_limiter.reset( + NewGenericRateLimiter(static_cast(0.7 * raw_rate))); + env_->bytes_written_ = 0; + DestroyAndReopen(options); - // writing 20 x 64KB = 10 x 128KB - // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB - for (int i = 0; i < 20; ++i) { - gen_l0_kb(i, 64, 32); + start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); } - dbfull()->TEST_WaitForCompact(); - uint64_t total_size = - SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); - ASSERT_TRUE(total_size < k128KB * 7 * 1.5); - - // Test level0_stop_writes_trigger. - // Clean up memtable and L0. 
Block compaction threads. If continue to write - // and flush memtables. We should see put stop after 8 memtable flushes - // since level0_stop_writes_trigger = 8 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - // Block compaction - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + elapsed = env_->NowMicros() - start; + Close(); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); + double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; + fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio); + ASSERT_TRUE(ratio < 0.8); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DelayWrite:Wait", - [&](void* arg) { sleeping_task_low.WakeUp(); }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // # rate limiting with half of the raw_rate + options.rate_limiter.reset( + NewGenericRateLimiter(static_cast(raw_rate / 2))); + env_->bytes_written_ = 0; + DestroyAndReopen(options); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - int count = 0; - Random rnd(301); - WriteOptions wo; - while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - if (sleeping_task_low.WokenUp()) { - break; - } - dbfull()->TEST_FlushMemTable(true); - count++; + start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); } - // Stop trigger = 8 - ASSERT_EQ(count, 8); - // Unblock - sleeping_task_low.WaitUntilDone(); + elapsed = env_->NowMicros() - start; + Close(); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); + ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; + fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio); + ASSERT_LT(ratio, 0.6); +} - // Now reduce level0_stop_writes_trigger to 6. 
Clear up memtables and L0. - // Block compaction thread again. Perform the put and memtable flushes - // until we see the stop after 6 memtable flushes. - ASSERT_OK(dbfull()->SetOptions({ - {"level0_stop_writes_trigger", "6"} - })); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); +TEST_F(DBTest, TableOptionsSanitizeTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); - // Block compaction again - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - count = 0; - while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - if (sleeping_task_low.WokenUp()) { - break; - } - dbfull()->TEST_FlushMemTable(true); - count++; - } - ASSERT_EQ(count, 6); - // Unblock - sleeping_task_low.WaitUntilDone(); + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewNoopTransform()); + Destroy(options); + ASSERT_TRUE(!TryReopen(options).IsNotSupported()); - // Test disable_auto_compactions - // Compaction thread is unblocked but auto compaction is disabled. Write - // 4 L0 files and compaction should be triggered. If auto compaction is - // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of - // L0 files do not change after the call. 
- ASSERT_OK(dbfull()->SetOptions({ - {"disable_auto_compactions", "true"} - })); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + // Test for check of prefix_extractor when hash index is used for + // block-based table + BlockBasedTableOptions to; + to.index_type = BlockBasedTableOptions::kHashSearch; + options = CurrentOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(to)); + ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + ASSERT_OK(TryReopen(options)); +} - for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); - } - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(0), 4); +// On Windows you can have either memory mapped file or a file +// with unbuffered access. So this asserts and does not make +// sense to run +#ifndef OS_WIN +TEST_F(DBTest, MmapAndBufferOptions) { + Options options = CurrentOptions(); - // Enable auto compaction and perform the same test, # of L0 files should be - // reduced after compaction. 
- ASSERT_OK(dbfull()->SetOptions({ - {"disable_auto_compactions", "false"} - })); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + options.allow_os_buffer = false; + options.allow_mmap_reads = true; + ASSERT_NOK(TryReopen(options)); - for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); - } - dbfull()->TEST_WaitForCompact(); - ASSERT_LT(NumTableFilesAtLevel(0), 4); + // All other combinations are acceptable + options.allow_os_buffer = true; + ASSERT_OK(TryReopen(options)); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + options.allow_os_buffer = false; + options.allow_mmap_reads = false; + ASSERT_OK(TryReopen(options)); + + options.allow_os_buffer = true; + ASSERT_OK(TryReopen(options)); } +#endif -TEST_F(DBTest, FileCreationRandomFailure) { - Options options; - options.env = env_; +TEST_F(DBTest, ConcurrentMemtableNotSupported) { + Options options = CurrentOptions(); + options.allow_concurrent_memtable_write = true; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; options.create_if_missing = true; - options.write_buffer_size = 100000; // Small write buffer - options.target_file_size_base = 200000; - options.max_bytes_for_level_base = 1000000; - options.max_bytes_for_level_multiplier = 2; - DestroyAndReopen(options); - Random rnd(301); + DestroyDB(dbname_, options); + options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4)); + ASSERT_NOK(TryReopen(options)); - const int kCDTKeysPerBuffer = 4; - const int kTestSize = kCDTKeysPerBuffer * 4096; - const int kTotalIteration = 100; - // the second half of the test involves in random failure - // of file creation. 
- const int kRandomFailureTest = kTotalIteration / 2; - std::vector values; - for (int i = 0; i < kTestSize; ++i) { - values.push_back("NOT_FOUND"); - } - for (int j = 0; j < kTotalIteration; ++j) { - if (j == kRandomFailureTest) { - env_->non_writeable_rate_.store(90); - } - for (int k = 0; k < kTestSize; ++k) { - // here we expect some of the Put fails. - std::string value = RandomString(&rnd, 100); - Status s = Put(Key(k), Slice(value)); - if (s.ok()) { - // update the latest successful put - values[k] = value; - } - // But everything before we simulate the failure-test should succeed. - if (j < kRandomFailureTest) { - ASSERT_OK(s); - } - } - } + options.memtable_factory.reset(new SkipListFactory); + ASSERT_OK(TryReopen(options)); - // If rocksdb does not do the correct job, internal assert will fail here. - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ColumnFamilyOptions cf_options(options); + cf_options.memtable_factory.reset( + NewHashLinkListRepFactory(4, 0, 3, true, 4)); + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle)); +} - // verify we have the latest successful update - for (int k = 0; k < kTestSize; ++k) { - auto v = Get(Key(k)); - ASSERT_EQ(v, values[k]); - } +#endif // ROCKSDB_LITE - // reopen and reverify we have the latest successful update - env_->non_writeable_rate_.store(0); - Reopen(options); - for (int k = 0; k < kTestSize; ++k) { - auto v = Get(Key(k)); - ASSERT_EQ(v, values[k]); - } -} +TEST_F(DBTest, SanitizeNumThreads) { + for (int attempt = 0; attempt < 2; attempt++) { + const size_t kTotalTasks = 8; + test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; -TEST_F(DBTest, DynamicMiscOptions) { - // Test max_sequential_skip_in_iterations - Options options; - options.env = env_; - options.create_if_missing = true; - options.max_sequential_skip_in_iterations = 16; - options.compression = kNoCompression; - options.statistics = rocksdb::CreateDBStatistics(); - 
DestroyAndReopen(options); + Options options = CurrentOptions(); + if (attempt == 0) { + options.max_background_compactions = 3; + options.max_background_flushes = 2; + } + options.create_if_missing = true; + DestroyAndReopen(options); - auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { - int key0 = key_start; - int key1 = key_start + 1; - int key2 = key_start + 2; - Random rnd(301); - ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); - for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + for (size_t i = 0; i < kTotalTasks; i++) { + // Insert 5 tasks to low priority queue and 5 tasks to high priority queue + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], + (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); } - ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - iter->Seek(Key(key1)); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Key(key1)), 0); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Key(key2)), 0); - ASSERT_EQ(num_reseek, - TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); - }; - // No reseek - assert_reseek_count(100, 0); - ASSERT_OK(dbfull()->SetOptions({ - {"max_sequential_skip_in_iterations", "4"} - })); - // Clear memtable and make new option effective - dbfull()->TEST_FlushMemTable(true); - // Trigger reseek - assert_reseek_count(200, 1); + // Wait 100 milliseconds for they are scheduled. + env_->SleepForMicroseconds(100000); - ASSERT_OK(dbfull()->SetOptions({ - {"max_sequential_skip_in_iterations", "16"} - })); - // Clear memtable and make new option effective - dbfull()->TEST_FlushMemTable(true); - // No reseek - assert_reseek_count(300, 1); -} + // pool size 3, total task 4. Queue size should be 1. + ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW)); + // pool size 2, total task 4. Queue size should be 2. 
+ ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); -TEST_F(DBTest, DontDeletePendingOutputs) { - Options options; - options.env = env_; - options.create_if_missing = true; - DestroyAndReopen(options); + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } - // Every time we write to a table file, call FOF/POF with full DB scan. This - // will make sure our pending_outputs_ protection work correctly - std::function purge_obsolete_files_function = [&]() { - JobContext job_context(0); - dbfull()->TEST_LockMutex(); - dbfull()->FindObsoleteFiles(&job_context, true /*force*/); - dbfull()->TEST_UnlockMutex(); - dbfull()->PurgeObsoleteFiles(job_context); - job_context.Clean(); - }; + ASSERT_OK(Put("abc", "def")); + ASSERT_EQ("def", Get("abc")); + Flush(); + ASSERT_EQ("def", Get("abc")); + } +} - env_->table_write_callback_ = &purge_obsolete_files_function; +TEST_F(DBTest, WriteSingleThreadEntry) { + std::vector threads; + dbfull()->TEST_LockMutex(); + auto w = dbfull()->TEST_BeginWrite(); + threads.emplace_back([&] { Put("a", "b"); }); + env_->SleepForMicroseconds(10000); + threads.emplace_back([&] { Flush(); }); + env_->SleepForMicroseconds(10000); + dbfull()->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->TEST_EndWrite(w); + dbfull()->TEST_UnlockMutex(); - for (int i = 0; i < 2; ++i) { - ASSERT_OK(Put("a", "begin")); - ASSERT_OK(Put("z", "end")); - ASSERT_OK(Flush()); + for (auto& t : threads) { + t.join(); } +} + +TEST_F(DBTest, DisableDataSyncTest) { + env_->sync_counter_.store(0); + // iter 0 -- no sync + // iter 1 -- sync + for (int iter = 0; iter < 2; ++iter) { + Options options = CurrentOptions(); + options.disableDataSync = iter == 0; + options.create_if_missing = true; + options.num_levels = 10; + options.env = env_; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + MakeTables(10, "a", "z"); + Compact("a", "z"); - // If pending output guard does 
not work correctly, PurgeObsoleteFiles() will - // delete the file that Compaction is trying to create, causing this: error - // db/db_test.cc:975: IO error: - // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory - Compact("a", "b"); + if (iter == 0) { + ASSERT_EQ(env_->sync_counter_.load(), 0); + } else { + ASSERT_GT(env_->sync_counter_.load(), 0); + } + Destroy(options); + } } -TEST_F(DBTest, DontDeleteMovedFile) { - // This test triggers move compaction and verifies that the file is not - // deleted when it's part of move compaction - Options options = CurrentOptions(); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DynamicMemtableOptions) { + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k5KB = 5 * 1024; + const int kNumPutsBeforeWaitForFlush = 64; + Options options; options.env = env_; options.create_if_missing = true; - options.max_bytes_for_level_base = 1024 * 1024; // 1 MB - options.level0_file_num_compaction_trigger = - 2; // trigger compaction when we have 2 files + options.compression = kNoCompression; + options.max_background_compactions = 1; + options.write_buffer_size = k64KB; + options.arena_block_size = 16 * 1024; + options.max_write_buffer_number = 2; + // Don't trigger compact/slowdown/stop + options.level0_file_num_compaction_trigger = 1024; + options.level0_slowdown_writes_trigger = 1024; + options.level0_stop_writes_trigger = 1024; DestroyAndReopen(options); - Random rnd(301); - // Create two 1MB sst files - for (int i = 0; i < 2; ++i) { - // Create 1MB sst file - for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + auto gen_l0_kb = [this, kNumPutsBeforeWaitForFlush](int size) { + Random rnd(301); + for (int i = 0; i < size; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + + // The following condition prevents a race condition between flush jobs + // acquiring work and this thread filling up multiple memtables. 
Without + // this, the flush might produce less files than expected because + // multiple memtables are flushed into a single L0 file. This race + // condition affects assertion (A). + if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) { + dbfull()->TEST_WaitForFlushMemTable(); + } } - ASSERT_OK(Flush()); - } - // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,0,1", FilesPerLevel(0)); + dbfull()->TEST_WaitForFlushMemTable(); + }; - // If the moved file is actually deleted (the move-safeguard in - // ~Version::Version() is not there), we get this failure: - // Corruption: Can't access /000009.sst - Reopen(options); -} + // Test write_buffer_size + gen_l0_kb(64); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_LT(SizeAtLevel(0), k64KB + k5KB); + ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2); -TEST_F(DBTest, OptimizeFiltersForHits) { - Options options = CurrentOptions(); - options.write_buffer_size = 64 * 1024; - options.arena_block_size = 4 * 1024; - options.target_file_size_base = 64 * 1024; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 256 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; - options.compression = kNoCompression; - options.compaction_style = kCompactionStyleLevel; - options.level_compaction_dynamic_level_bytes = true; - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy(10, true)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - options.optimize_filters_for_hits = true; - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); + // Clean up L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + 
ASSERT_EQ(NumTableFilesAtLevel(0), 0); - int numkeys = 200000; + // Increase buffer size + ASSERT_OK(dbfull()->SetOptions({ + {"write_buffer_size", "131072"}, + })); - // Generate randomly shuffled keys, so the updates are almost - // random. - std::vector keys; - keys.reserve(numkeys); - for (int i = 0; i < numkeys; i += 2) { - keys.push_back(i); - } - std::random_shuffle(std::begin(keys), std::end(keys)); + // The existing memtable is still 64KB in size, after it becomes immutable, + // the next memtable will be 128KB in size. Write 256KB total, we should + // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data + gen_l0_kb(256); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A) + ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); + ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB); + + // Test max_write_buffer_number + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + // Start from scratch and disable compaction/flush. Flush can only happen + // during compaction but trigger is pretty high + options.max_background_flushes = 0; + options.disable_auto_compactions = true; + DestroyAndReopen(options); - int num_inserted = 0; - for (int key : keys) { - ASSERT_OK(Put(1, Key(key), "val")); - if (++num_inserted % 1000 == 0) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - } - } - ASSERT_OK(Put(1, Key(0), "val")); - ASSERT_OK(Put(1, Key(numkeys), "val")); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + // Put until writes are stopped, bounded by 256 puts. We should see stop at + // ~128KB + int count = 0; + Random rnd(301); - if (NumTableFilesAtLevel(0, 1) == 0) { - // No Level 0 file. Create one. 
- ASSERT_OK(Put(1, Key(0), "val")); - ASSERT_OK(Put(1, Key(numkeys), "val")); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - } + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* arg) { sleeping_task_low.WakeUp(); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - for (int i = 1; i < numkeys; i += 2) { - ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND"); + while (!sleeping_task_low.WokenUp() && count < 256) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; } + ASSERT_GT(static_cast(count), 128 * 0.8); + ASSERT_LT(static_cast(count), 128 * 1.2); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + sleeping_task_low.WaitUntilDone(); - // Now we have three sorted run, L0, L5 and L6 with most files in L6 have - // no blooom filter. Most keys be checked bloom filters twice. - ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); - ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); + // Increase + ASSERT_OK(dbfull()->SetOptions({ + {"max_write_buffer_number", "8"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - for (int i = 0; i < numkeys; i += 2) { - ASSERT_EQ(Get(1, Key(i)), "val"); + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + count = 0; + while (!sleeping_task_low.WokenUp() && count < 1024) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; } -} - -TEST_F(DBTest, L0L1L2AndUpHitCounter) { - Options options = CurrentOptions(); - options.write_buffer_size = 32 * 1024; - options.target_file_size_base = 32 * 1024; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - 
options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 64 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); +// Windows fails this test. Will tune in the future and figure out +// approp number +#ifndef OS_WIN + ASSERT_GT(static_cast(count), 512 * 0.8); + ASSERT_LT(static_cast(count), 512 * 1.2); +#endif + sleeping_task_low.WaitUntilDone(); - int numkeys = 20000; - for (int i = 0; i < numkeys; i++) { - ASSERT_OK(Put(1, Key(i), "val")); - } - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); - ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + // Decrease + ASSERT_OK(dbfull()->SetOptions({ + {"max_write_buffer_number", "4"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); - for (int i = 0; i < numkeys; i++) { - ASSERT_EQ(Get(1, Key(i)), "val"); + count = 0; + while (!sleeping_task_low.WokenUp() && count < 1024) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; } +// Windows fails this test. 
Will tune in the future and figure out +// approp number +#ifndef OS_WIN + ASSERT_GT(static_cast(count), 256 * 0.8); + ASSERT_LT(static_cast(count), 266 * 1.2); +#endif + sleeping_task_low.WaitUntilDone(); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); - - ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + - TestGetTickerCount(options, GET_HIT_L1) + - TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +#endif // ROCKSDB_LITE -TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { - // iter 0 -- zlib - // iter 1 -- bzip2 - // iter 2 -- lz4 - // iter 3 -- lz4HC - CompressionType compressions[] = {kZlibCompression, kBZip2Compression, - kLZ4Compression, kLZ4HCCompression}; - for (int iter = 0; iter < 4; ++iter) { - if (!CompressionTypeSupported(compressions[iter])) { - continue; +#if ROCKSDB_USING_THREAD_STATUS +namespace { +void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type, + int expected_count) { + int op_count = 0; + std::vector thread_list; + ASSERT_OK(env->GetThreadList(&thread_list)); + for (auto thread : thread_list) { + if (thread.operation_type == op_type) { + op_count++; } - // first_table_version 1 -- generate with table_version == 1, read with - // table_version == 2 - // first_table_version 2 -- generate with table_version == 2, read with - // table_version == 1 - for (int first_table_version = 1; first_table_version <= 2; - ++first_table_version) { - BlockBasedTableOptions table_options; - table_options.format_version = first_table_version; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - Options options = CurrentOptions(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.create_if_missing = true; - options.compression = compressions[iter]; - DestroyAndReopen(options); + } + 
ASSERT_EQ(op_count, expected_count); +} +} // namespace - int kNumKeysWritten = 100000; +TEST_F(DBTest, GetThreadStatus) { + Options options; + options.env = env_; + options.enable_thread_tracking = true; + TryReopen(options); - Random rnd(301); - for (int i = 0; i < kNumKeysWritten; ++i) { - // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); - } + std::vector thread_list; + Status s = env_->GetThreadList(&thread_list); - table_options.format_version = first_table_version == 1 ? 2 : 1; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); - ASSERT_EQ(r.substr(128), std::string(128, 'a')); + for (int i = 0; i < 2; ++i) { + // repeat the test with differet number of high / low priority threads + const int kTestCount = 3; + const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5}; + const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3}; + for (int test = 0; test < kTestCount; ++test) { + // Change the number of threads in high / low priority pool. 
+ env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH); + env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); + // Wait to ensure the all threads has been registered + env_->SleepForMicroseconds(100000); + s = env_->GetThreadList(&thread_list); + ASSERT_OK(s); + unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES]; + memset(thread_type_counts, 0, sizeof(thread_type_counts)); + for (auto thread : thread_list) { + ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES); + thread_type_counts[thread.thread_type]++; } + // Verify the total number of threades + ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY] + + thread_type_counts[ThreadStatus::LOW_PRIORITY], + kHighPriCounts[test] + kLowPriCounts[test]); + // Verify the number of high-priority threads + ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY], + kHighPriCounts[test]); + // Verify the number of low-priority threads + ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY], + kLowPriCounts[test]); + } + if (i == 0) { + // repeat the test with multiple column families + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + true); } } + db_->DropColumnFamily(handles_[2]); + delete handles_[2]; + handles_.erase(handles_.begin() + 2); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + true); + Close(); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + true); } -TEST_F(DBTest, MutexWaitStats) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, options); - const int64_t kMutexWaitDelay = 100; - ThreadStatusUtil::TEST_SetStateDelay( - ThreadStatus::STATE_MUTEX_WAIT, kMutexWaitDelay); - ASSERT_OK(Put("hello", "rocksdb")); - ASSERT_GE(TestGetTickerCount( - options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay); - 
ThreadStatusUtil::TEST_SetStateDelay( - ThreadStatus::STATE_MUTEX_WAIT, 0); +TEST_F(DBTest, DisableThreadStatus) { + Options options; + options.env = env_; + options.enable_thread_tracking = false; + TryReopen(options); + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + // Verify non of the column family info exists + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, + false); } -// This reproduces a bug where we don't delete a file because when it was -// supposed to be deleted, it was blocked by pending_outputs -// Consider: -// 1. current file_number is 13 -// 2. compaction (1) starts, blocks deletion of all files starting with 13 -// (pending outputs) -// 3. file 13 is created by compaction (2) -// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file -// 13 has no references, it is put into VersionSet::obsolete_files_ -// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13 -// is deleted from obsolete_files_ set. -// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by -// pending outputs since compaction (1) is still running. It is not deleted and -// it is not present in obsolete_files_ anymore. Therefore, we never delete it. 
-TEST_F(DBTest, DeleteObsoleteFilesPendingOutputs) { - Options options = CurrentOptions(); +TEST_F(DBTest, ThreadStatusFlush) { + Options options; options.env = env_; - options.write_buffer_size = 2 * 1024 * 1024; // 2 MB - options.max_bytes_for_level_base = 1024 * 1024; // 1 MB - options.level0_file_num_compaction_trigger = - 2; // trigger compaction when we have 2 files - options.max_background_flushes = 2; - options.max_background_compactions = 2; + options.write_buffer_size = 100000; // Small write buffer + options.enable_thread_tracking = true; + options = CurrentOptions(options); - OnFileDeletionListener* listener = new OnFileDeletionListener(); - options.listeners.emplace_back(listener); + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"}, + {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); - Random rnd(301); - // Create two 1MB sst files - for (int i = 0; i < 2; ++i) { - // Create 1MB sst file - for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); - } - ASSERT_OK(Flush()); - } - // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,0,1", FilesPerLevel(0)); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); + + uint64_t num_running_flushes = 0; + db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_EQ(num_running_flushes, 0); + + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger flush - test::SleepingBackgroundTask blocking_thread; - port::Mutex mutex_; - bool already_blocked(false); + // The first sync point is to make sure there's 
one flush job + // running when we perform VerifyOperationCount(). + TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1"); + VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1); + db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_EQ(num_running_flushes, 1); + // This second sync point is to ensure the flush job will not + // be completed until we already perform VerifyOperationCount(). + TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2"); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + const int kNumL0Files = 4; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_subcompactions = max_subcompactions_; + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"}, + {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"}, + {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"}, + }); + for (int tests = 0; tests < 2; ++tests) { + DestroyAndReopen(options); + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - // block the flush - std::function block_first_time = [&]() { - bool blocking = false; - { - MutexLock l(&mutex_); - if (!already_blocked) { - 
blocking = true; - already_blocked = true; + Random rnd(301); + // The Put Phase. + for (int file = 0; file < kNumL0Files; ++file) { + for (int key = 0; key < kEntriesPerBuffer; ++key) { + ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer), + RandomString(&rnd, kTestValueSize))); } + Flush(); } - if (blocking) { - blocking_thread.DoSleep(); - } - }; - env_->table_write_callback_ = &block_first_time; - // Create 1MB sst file - for (int j = 0; j < 256; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); - } - // this should trigger a flush, which is blocked with block_first_time - // pending_file is protecting all the files created after - - ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr)); - - ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(metadata.size(), 1U); - auto file_on_L2 = metadata[0].name; - listener->SetExpectedFileName(dbname_ + file_on_L2); - - ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, - true /* disallow trivial move */)); - ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); + // This makes sure a compaction won't be scheduled until + // we have done with the above Put Phase. + uint64_t num_running_compactions = 0; + db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions); + ASSERT_EQ(num_running_compactions, 0); + TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0"); + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); - // finish the flush! - blocking_thread.WakeUp(); - blocking_thread.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0)); + // This makes sure at least one compaction is running. 
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1"); - metadata.clear(); - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(metadata.size(), 2U); + if (options.enable_thread_tracking) { + // expecting one single L0 to L1 compaction + VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1); + } else { + // If thread tracking is not enabled, compaction count should be 0. + VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0); + } + db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions); + ASSERT_EQ(num_running_compactions, 1); + // TODO(yhchiang): adding assert to verify each compaction stage. + TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2"); - // This file should have been deleted during last compaction - ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2)); - listener->VerifyMatchedCount(1); + // repeat the test with disabling thread tracking. + options.enable_thread_tracking = false; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } } -TEST_F(DBTest, CloseSpeedup) { +TEST_P(DBTestWithParam, PreShutdownManualCompaction) { Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleLevel; - options.write_buffer_size = 110 << 10; // 110KB - options.arena_block_size = 4 << 10; - options.level0_file_num_compaction_trigger = 2; - options.num_levels = 4; - options.max_bytes_for_level_base = 400 * 1024; - options.max_write_buffer_number = 16; + options.max_background_flushes = 0; + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); - // Block background threads - env_->SetBackgroundThreads(1, Env::LOW); - env_->SetBackgroundThreads(1, Env::HIGH); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - test::SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - 
&sleeping_task_high, Env::Priority::HIGH); + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); - std::vector filenames; - env_->GetChildren(dbname_, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(dbname_ + "/" + filenames[i]); - } - env_->DeleteDir(dbname_); - DestroyAndReopen(options); + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - env_->SetBackgroundThreads(1, Env::LOW); - env_->SetBackgroundThreads(1, Env::HIGH); - Random rnd(301); - int key_idx = 0; + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); - // First three 110KB files are not going to level 2 - // After that, (100K, 200K) - for (int num = 0; num < 5; num++) { - GenerateNewFile(&rnd, &key_idx, true); - } + // Compaction range overlaps files + Compact(1, "p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); - ASSERT_EQ(0, GetSstFileCount(dbname_)); + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("1,1,2", FilesPerLevel(1)); - Close(); - ASSERT_EQ(0, GetSstFileCount(dbname_)); + // Compact just the new range + Compact(1, "b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel(1)); - // Unblock background threads - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + // Compact all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + CancelAllBackgroundWork(db_); + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); - Destroy(options); + if (iter == 0) { + options = CurrentOptions(); + options.max_background_flushes = 0; + options.num_levels = 3; + options.create_if_missing = true; + 
DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + } + } } -class DelayedMergeOperator : public AssociativeMergeOperator { - private: - DBTest* db_test_; - - public: - explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} - virtual bool Merge(const Slice& key, const Slice* existing_value, - const Slice& value, std::string* new_value, - Logger* logger) const override { - db_test_->env_->addon_time_.fetch_add(1000); - return true; - } +TEST_F(DBTest, PreShutdownFlush) { + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "key", "value")); + CancelAllBackgroundWork(db_); + Status s = + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_TRUE(s.IsShutdownInProgress()); +} - virtual const char* Name() const override { return "DelayedMergeOperator"; } -}; +TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 40; + const int kNumL0Files = 4; -TEST_F(DBTest, MergeTestTime) { - std::string one, two, three; - PutFixed64(&one, 1); - PutFixed64(&two, 2); - PutFixed64(&three, 3); + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); - // Enable time profiling - SetPerfLevel(kEnableTime); - this->env_->addon_time_.store(0); Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; options = CurrentOptions(options); - options.statistics = rocksdb::CreateDBStatistics(); - 
options.merge_operator.reset(new DelayedMergeOperator(this)); - DestroyAndReopen(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.max_subcompactions = max_subcompactions_; - ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); - db_->Put(WriteOptions(), "foo", one); - ASSERT_OK(Flush()); - ASSERT_OK(db_->Merge(WriteOptions(), "foo", two)); - ASSERT_OK(Flush()); - ASSERT_OK(db_->Merge(WriteOptions(), "foo", three)); - ASSERT_OK(Flush()); + TryReopen(options); + Random rnd(301); - ReadOptions opt; - opt.verify_checksums = true; - opt.snapshot = nullptr; - std::string result; - db_->Get(opt, "foo", &result); + std::vector thread_list; + // Delay both flush and compaction + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownMultipleCompaction:Preshutdown"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"}, + {"DBTest::PreShutdownMultipleCompaction:Preshutdown", + "CompactionJob::Run():End"}, + {"CompactionJob::Run():End", + "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}}); - ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 2800000); - ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 1200000); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ReadOptions read_options; - std::unique_ptr iter(db_->NewIterator(read_options)); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - ++count; + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int 
operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + } + + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + + // Speed up the test + if (operation_count[ThreadStatus::OP_FLUSH] > 1 && + operation_count[ThreadStatus::OP_COMPACTION] > + 0.6 * options.max_background_compactions) { + break; + } + if (file == 15 * kNumL0Files) { + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + } + } + + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); + CancelAllBackgroundWork(db_); + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); + dbfull()->TEST_WaitForCompact(); + // Record the number of compactions at a time. + for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { + operation_count[i] = 0; + } + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; } + ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); +} - ASSERT_EQ(1, count); +TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 40; + const int kNumL0Files = 4; - ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 6000000); - ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 3200000); -#if ROCKSDB_USING_THREAD_STATUS - ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0); -#endif // ROCKSDB_USING_THREAD_STATUS -} + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, 
Env::LOW); -TEST_P(DBTestWithParam, MergeCompactionTimeTest) { - SetPerfLevel(kEnableTime); Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; options = CurrentOptions(options); - options.compaction_filter_factory = std::make_shared(); - options.statistics = rocksdb::CreateDBStatistics(); - options.merge_operator.reset(new DelayedMergeOperator(this)); - options.compaction_style = kCompactionStyleUniversal; + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; options.max_subcompactions = max_subcompactions_; - DestroyAndReopen(options); - for (int i = 0; i < 1000; i++) { - ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST")); - ASSERT_OK(Flush()); - } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + TryReopen(options); + Random rnd(301); - ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); -} + std::vector thread_list; + // Delay both flush and compaction + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBTest::PreShutdownCompactionMiddle:Preshutdown", + "CompactionJob::Run():Inprogress"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"}, + {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"}, + {"CompactionJob::Run():End", + "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}}); -TEST_P(DBTestWithParam, FilterCompactionTimeTest) { - Options options; - options.compaction_filter_factory = - 
std::make_shared(this); - options.disable_auto_compactions = true; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); - DestroyAndReopen(options); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - // put some data - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); } - Flush(); - } - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ(0U, CountLiveFiles()); + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } - Reopen(options); + // Speed up the test + if (operation_count[ThreadStatus::OP_FLUSH] > 1 && + operation_count[ThreadStatus::OP_COMPACTION] > + 0.6 * options.max_background_compactions) { + break; + } + if (file == 15 * kNumL0Files) { + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction"); + } + } - Iterator* itr = db_->NewIterator(ReadOptions()); - itr->SeekToFirst(); - ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); - delete itr; + ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); + CancelAllBackgroundWork(db_); + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); + dbfull()->TEST_WaitForCompact(); + // Record the number of compactions at a time. 
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { + operation_count[i] = 0; + } + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); } -TEST_F(DBTest, TestLogCleanup) { - Options options = CurrentOptions(); - options.write_buffer_size = 64 * 1024; // very small - // only two memtables allowed ==> only two log files - options.max_write_buffer_number = 2; - Reopen(options); +#endif // ROCKSDB_USING_THREAD_STATUS - for (int i = 0; i < 100000; ++i) { - Put(Key(i), "val"); - // only 2 memtables will be alive, so logs_to_free needs to always be below - // 2 - ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); - } +#ifndef ROCKSDB_LITE +TEST_F(DBTest, FlushOnDestroy) { + WriteOptions wo; + wo.disableWAL = true; + ASSERT_OK(Put("foo", "v1", wo)); + CancelAllBackgroundWork(db_); } -TEST_F(DBTest, EmptyCompactedDB) { +TEST_F(DBTest, DynamicLevelCompressionPerLevel) { + if (!Snappy_Supported()) { + return; + } + const int kNKeys = 120; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + std::random_shuffle(std::begin(keys), std::end(keys)); + + Random rnd(301); Options options; - options.max_open_files = -1; - options = CurrentOptions(options); - Close(); - ASSERT_OK(ReadOnlyReopen(options)); - Status s = Put("new", "value"); - ASSERT_TRUE(s.IsNotSupported()); - Close(); -} + options.create_if_missing = true; + options.db_write_buffer_size = 20480; + options.write_buffer_size = 20480; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.target_file_size_base = 20480; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 102400; + options.max_bytes_for_level_multiplier = 4; + options.max_background_compactions = 1; + 
options.num_levels = 5; -class CountingDeleteTabPropCollector : public TablePropertiesCollector { - public: - const char* Name() const override { return "CountingDeleteTabPropCollector"; } + options.compression_per_level.resize(3); + options.compression_per_level[0] = kNoCompression; + options.compression_per_level[1] = kNoCompression; + options.compression_per_level[2] = kSnappyCompression; - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { - if (type == kEntryDelete) { - num_deletes_++; - } - return Status::OK(); - } + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); - bool NeedCompact() const override { return num_deletes_ > 10; } + DestroyAndReopen(options); - UserCollectedProperties GetReadableProperties() const override { - return UserCollectedProperties{}; + // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should + // be compressed, so total data size should be more than 80K. + for (int i = 0; i < 20; i++) { + ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } + Flush(); + dbfull()->TEST_WaitForCompact(); - Status Finish(UserCollectedProperties* properties) override { - *properties = - UserCollectedProperties{{"num_delete", ToString(num_deletes_)}}; - return Status::OK(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(3), 0); + // Assuming each files' metadata is at least 50 bytes/ + ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4); + + // Insert 400KB. 
Some data will be compressed + for (int i = 21; i < 120; i++) { + ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), + 120U * 4000U + 50U * 24); + // Make sure data in files in L3 is not compacted by removing all files + // in L4 and calculate number of rows + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + })); + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + for (auto file : cf_meta.levels[4].files) { + listener->SetExpectedFileName(dbname_ + file.name); + ASSERT_OK(dbfull()->DeleteFile(file.name)); + } + listener->VerifyMatchedCount(cf_meta.levels[4].files.size()); - private: - uint32_t num_deletes_ = 0; -}; + int num_keys = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + ASSERT_OK(iter->status()); + ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U); +} -class CountingDeleteTabPropCollectorFactory - : public TablePropertiesCollectorFactory { - public: - virtual TablePropertiesCollector* CreateTablePropertiesCollector() override { - return new CountingDeleteTabPropCollector(); +TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { + if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) { + return; } - const char* Name() const override { - return "CountingDeleteTabPropCollectorFactory"; + const int kNKeys = 500; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; } -}; + std::random_shuffle(std::begin(keys), std::end(keys)); -TEST_F(DBTest, TablePropertiesNeedCompactTest) { Random rnd(301); - Options options; options.create_if_missing = true; - options.write_buffer_size = 4096; - options.max_write_buffer_number = 8; + options.db_write_buffer_size = 6000; + 
options.write_buffer_size = 6000; + options.max_write_buffer_number = 2; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.target_file_size_base = 2048; - options.max_bytes_for_level_base = 10240; - options.max_bytes_for_level_multiplier = 4; - options.soft_rate_limit = 1.1; - options.num_levels = 8; + options.level0_stop_writes_trigger = 2; + options.soft_pending_compaction_bytes_limit = 1024 * 1024; + + // Use file size to distinguish levels + // L1: 10, L2: 20, L3 40, L4 80 + // L0 is less than 30 + options.target_file_size_base = 10; + options.target_file_size_multiplier = 2; + + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 200; + options.max_bytes_for_level_multiplier = 8; + options.max_background_compactions = 1; + options.num_levels = 5; + std::shared_ptr mtf(new mock::MockTableFactory); + options.table_factory = mtf; - std::shared_ptr collector_factory( - new CountingDeleteTabPropCollectorFactory); - options.table_properties_collector_factories.resize(1); - options.table_properties_collector_factories[0] = collector_factory; + options.compression_per_level.resize(3); + options.compression_per_level[0] = kNoCompression; + options.compression_per_level[1] = kLZ4Compression; + options.compression_per_level[2] = kZlibCompression; DestroyAndReopen(options); + // When base level is L4, L4 is LZ4. 
+ std::atomic num_zlib(0); + std::atomic num_lz4(0); + std::atomic num_no(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + if (compaction->output_level() == 4) { + ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); + num_lz4.fetch_add(1); + } + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { + auto* compression = reinterpret_cast(arg); + ASSERT_TRUE(*compression == kNoCompression); + num_no.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - const int kMaxKey = 1000; - for (int i = 0; i < kMaxKey; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); - } - Flush(); - dbfull()->TEST_WaitForCompact(); - if (NumTableFilesAtLevel(0) == 1) { - // Clear Level 0 so that when later flush a file with deletions, - // we don't trigger an organic compaction. 
- ASSERT_OK(Put(Key(0), "")); - ASSERT_OK(Put(Key(kMaxKey * 2), "")); - Flush(); - dbfull()->TEST_WaitForCompact(); - } - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); - { - int c = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - iter->Seek(Key(kMaxKey - 100)); - while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { - iter->Next(); - ++c; + if (i % 25 == 0) { + dbfull()->TEST_WaitForFlushMemTable(); } - ASSERT_EQ(c, 200); - } - - Delete(Key(0)); - for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) { - Delete(Key(i)); } - Delete(Key(kMaxKey * 2)); Flush(); + dbfull()->TEST_WaitForFlushMemTable(); dbfull()->TEST_WaitForCompact(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - { - SetPerfLevel(kEnableCount); - perf_context.Reset(); - int c = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - iter->Seek(Key(kMaxKey - 100)); - while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { - iter->Next(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(3), 0); + ASSERT_GT(NumTableFilesAtLevel(4), 0); + ASSERT_GT(num_no.load(), 2); + ASSERT_GT(num_lz4.load(), 0); + int prev_num_files_l4 = NumTableFilesAtLevel(4); + + // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib + num_lz4.store(0); + num_no.store(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + if (compaction->output_level() == 4 && compaction->start_level() == 3) { + ASSERT_TRUE(compaction->output_compression() == kZlibCompression); + num_zlib.fetch_add(1); + } else { + ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); + num_lz4.fetch_add(1); + } + }); + 
rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { + auto* compression = reinterpret_cast(arg); + ASSERT_TRUE(*compression == kNoCompression); + num_no.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 101; i < 500; i++) { + ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200))); + if (i % 100 == 99) { + Flush(); + dbfull()->TEST_WaitForCompact(); } - ASSERT_EQ(c, 0); - ASSERT_LT(perf_context.internal_delete_skipped_count, 30u); - ASSERT_LT(perf_context.internal_key_skipped_count, 30u); - SetPerfLevel(kDisable); } -} -TEST_F(DBTest, SuggestCompactRangeTest) { - class CompactionFilterFactoryGetContext : public CompactionFilterFactory { - public: - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { - saved_context = context; - std::unique_ptr empty_filter; - return empty_filter; - } - const char* Name() const override { - return "CompactionFilterFactoryGetContext"; - } - static bool IsManual(CompactionFilterFactory* compaction_filter_factory) { - return reinterpret_cast( - compaction_filter_factory)->saved_context.is_manual_compaction; - } - CompactionFilter::Context saved_context; - }; + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_GT(NumTableFilesAtLevel(3), 0); + ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4); + ASSERT_GT(num_no.load(), 2); + ASSERT_GT(num_lz4.load(), 0); + ASSERT_GT(num_zlib.load(), 0); +} - Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleLevel; - options.compaction_filter_factory.reset( - new CompactionFilterFactoryGetContext()); - options.write_buffer_size = 100 << 10; - options.arena_block_size = 4 << 10; - options.level0_file_num_compaction_trigger = 4; - options.num_levels = 4; 
+TEST_F(DBTest, DynamicCompactionOptions) { + // minimum write buffer size is enforced at 64KB + const uint64_t k32KB = 1 << 15; + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k1MB = 1 << 20; + const uint64_t k4KB = 1 << 12; + Options options; + options.env = env_; + options.create_if_missing = true; options.compression = kNoCompression; - options.max_bytes_for_level_base = 450 << 10; - options.target_file_size_base = 98 << 10; - options.max_grandparent_overlap_factor = 1 << 20; // inf - - Reopen(options); + options.soft_pending_compaction_bytes_limit = 1024 * 1024; + options.write_buffer_size = k64KB; + options.arena_block_size = 4 * k4KB; + options.max_write_buffer_number = 2; + // Compaction related options + options.level0_file_num_compaction_trigger = 3; + options.level0_slowdown_writes_trigger = 4; + options.level0_stop_writes_trigger = 8; + options.max_grandparent_overlap_factor = 10; + options.expanded_compaction_factor = 25; + options.source_compaction_factor = 1; + options.target_file_size_base = k64KB; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = k128KB; + options.max_bytes_for_level_multiplier = 4; - Random rnd(301); + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + DestroyAndReopen(options); - for (int num = 0; num < 3; num++) { - GenerateNewRandomFile(&rnd); - } + auto gen_l0_kb = [this](int start, int size, int stride) { + Random rnd(301); + for (int i = 0; i < size; i++) { + ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); + } + dbfull()->TEST_WaitForFlushMemTable(); + }; - GenerateNewRandomFile(&rnd); - ASSERT_EQ("0,4", FilesPerLevel(0)); - ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual( - options.compaction_filter_factory.get())); + // Write 3 files that have the same key range. 
+ // Since level0_file_num_compaction_trigger is 3, compaction should be + // triggered. The compaction should result in one L1 file + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel()); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1U, metadata.size()); + ASSERT_LE(metadata[0].size, k64KB + k4KB); + ASSERT_GE(metadata[0].size, k64KB - k4KB); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("1,4", FilesPerLevel(0)); + // Test compaction trigger and target_file_size_base + // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. + // Writing to 64KB L0 files should trigger a compaction. Since these + // 2 L0 files have the same key range, compaction merge them and should + // result in 2 32KB L1 files. + ASSERT_OK(dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}, + {"target_file_size_base", ToString(k32KB)}})); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("2,4", FilesPerLevel(0)); + gen_l0_kb(0, 64, 1); + ASSERT_EQ("1,1", FilesPerLevel()); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,2", FilesPerLevel()); + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2U, metadata.size()); + ASSERT_LE(metadata[0].size, k32KB + k4KB); + ASSERT_GE(metadata[0].size, k32KB - k4KB); + ASSERT_LE(metadata[1].size, k32KB + k4KB); + ASSERT_GE(metadata[1].size, k32KB - k4KB); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("3,4", FilesPerLevel(0)); + // Test max_bytes_for_level_base + // Increase level base size to 256KB and write enough data that will + // fill L1 and L2. L1 size should be around 256KB while L2 size should be + // around 256KB x 4. 
+ ASSERT_OK( + dbfull()->SetOptions({{"max_bytes_for_level_base", ToString(k1MB)}})); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("0,4,4", FilesPerLevel(0)); + // writing 96 x 64KB => 6 * 1024KB + // (L1 + L2) = (1 + 4) * 1024KB + for (int i = 0; i < 96; ++i) { + gen_l0_kb(i, 64, 96); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_GT(SizeAtLevel(1), k1MB / 2); + ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("1,4,4", FilesPerLevel(0)); + // Within (0.5, 1.5) of 4MB. + ASSERT_GT(SizeAtLevel(2), 2 * k1MB); + ASSERT_LT(SizeAtLevel(2), 6 * k1MB); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("2,4,4", FilesPerLevel(0)); + // Test max_bytes_for_level_multiplier and + // max_bytes_for_level_base. Now, reduce both mulitplier and level base, + // After filling enough data that can fit in L1 - L3, we should see L1 size + // reduces to 128KB from 256KB which was asserted previously. Same for L2. + ASSERT_OK( + dbfull()->SetOptions({{"max_bytes_for_level_multiplier", "2"}, + {"max_bytes_for_level_base", ToString(k128KB)}})); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("3,4,4", FilesPerLevel(0)); + // writing 20 x 64KB = 10 x 128KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB + for (int i = 0; i < 20; ++i) { + gen_l0_kb(i, 64, 32); + } + dbfull()->TEST_WaitForCompact(); + uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); + ASSERT_TRUE(total_size < k128KB * 7 * 1.5); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("0,4,8", FilesPerLevel(0)); + // Test level0_stop_writes_trigger. + // Clean up memtable and L0. Block compaction threads. If continue to write + // and flush memtables. 
We should see put stop after 8 memtable flushes + // since level0_stop_writes_trigger = 8 + dbfull()->TEST_FlushMemTable(true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // Block compaction + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + int count = 0; + Random rnd(301); + WriteOptions wo; + while (count < 64) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); + dbfull()->TEST_FlushMemTable(true); + count++; + if (dbfull()->TEST_write_controler().IsStopped()) { + sleeping_task_low.WakeUp(); + break; + } + } + // Stop trigger = 8 + ASSERT_EQ(count, 8); + // Unblock + sleeping_task_low.WaitUntilDone(); - GenerateNewRandomFile(&rnd); - ASSERT_EQ("1,4,8", FilesPerLevel(0)); + // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. + // Block compaction thread again. Perform the put and memtable flushes + // until we see the stop after 6 memtable flushes. 
+ ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}})); + dbfull()->TEST_FlushMemTable(true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); - // compact it three times - for (int i = 0; i < 3; ++i) { - ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + // Block compaction again + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + count = 0; + while (count < 64) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); + dbfull()->TEST_FlushMemTable(true); + count++; + if (dbfull()->TEST_write_controler().IsStopped()) { + sleeping_task_low.WakeUp(); + break; + } } + ASSERT_EQ(count, 6); + // Unblock + sleeping_task_low.WaitUntilDone(); - ASSERT_EQ("0,0,13", FilesPerLevel(0)); - - GenerateNewRandomFile(&rnd); - ASSERT_EQ("1,0,13", FilesPerLevel(0)); + // Test disable_auto_compactions + // Compaction thread is unblocked but auto compaction is disabled. Write + // 4 L0 files and compaction should be triggered. If auto compaction is + // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of + // L0 files do not change after the call. 
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}})); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); - // nonoverlapping with the file on level 0 - Slice start("a"), end("b"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't stop + dbfull()->TEST_FlushMemTable(true); + } dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 4); - // should not compact the level 0 file - ASSERT_EQ("1,0,13", FilesPerLevel(0)); + // Enable auto compaction and perform the same test, # of L0 files should be + // reduced after compaction. + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); - start = Slice("j"); - end = Slice("m"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't stop + dbfull()->TEST_FlushMemTable(true); + } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual( - options.compaction_filter_factory.get())); - - // now it should compact the level 0 file - ASSERT_EQ("0,1,13", FilesPerLevel(0)); + ASSERT_LT(NumTableFilesAtLevel(0), 4); } +#endif // ROCKSDB_LITE -TEST_F(DBTest, PromoteL0) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.write_buffer_size = 10 * 1024 * 1024; - DestroyAndReopen(options); - - // non overlapping ranges - std::vector> ranges = { - {81, 160}, {0, 80}, {161, 240}, {241, 320}}; - - int32_t value_size = 10 * 1024; // 10 KB +TEST_F(DBTest, FileCreationRandomFailure) { + Options options; + options.env = env_; + options.create_if_missing = true; + 
options.write_buffer_size = 100000; // Small write buffer + options.target_file_size_base = 200000; + options.max_bytes_for_level_base = 1000000; + options.max_bytes_for_level_multiplier = 2; + DestroyAndReopen(options); Random rnd(301); - std::map values; - for (const auto& range : ranges) { - for (int32_t j = range.first; j < range.second; j++) { - values[j] = RandomString(&rnd, value_size); - ASSERT_OK(Put(Key(j), values[j])); + + const int kCDTKeysPerBuffer = 4; + const int kTestSize = kCDTKeysPerBuffer * 4096; + const int kTotalIteration = 100; + // the second half of the test involves in random failure + // of file creation. + const int kRandomFailureTest = kTotalIteration / 2; + std::vector values; + for (int i = 0; i < kTestSize; ++i) { + values.push_back("NOT_FOUND"); + } + for (int j = 0; j < kTotalIteration; ++j) { + if (j == kRandomFailureTest) { + env_->non_writeable_rate_.store(90); + } + for (int k = 0; k < kTestSize; ++k) { + // here we expect some of the Put fails. + std::string value = RandomString(&rnd, 100); + Status s = Put(Key(k), Slice(value)); + if (s.ok()) { + // update the latest successful put + values[k] = value; + } + // But everything before we simulate the failure-test should succeed. + if (j < kRandomFailureTest) { + ASSERT_OK(s); + } } - ASSERT_OK(Flush()); } - int32_t level0_files = NumTableFilesAtLevel(0, 0); - ASSERT_EQ(level0_files, ranges.size()); - ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 + // If rocksdb does not do the correct job, internal assert will fail here. + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); - // Promote L0 level to L2. 
- ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2)); - // We expect that all the files were trivially moved from L0 to L2 - ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); - ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files); + // verify we have the latest successful update + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); + } - for (const auto& kv : values) { - ASSERT_EQ(Get(Key(kv.first)), kv.second); + // reopen and reverify we have the latest successful update + env_->non_writeable_rate_.store(0); + Reopen(options); + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); } } -TEST_F(DBTest, PromoteL0Failure) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.write_buffer_size = 10 * 1024 * 1024; +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DynamicMiscOptions) { + // Test max_sequential_skip_in_iterations + Options options; + options.env = env_; + options.create_if_missing = true; + options.max_sequential_skip_in_iterations = 16; + options.compression = kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); DestroyAndReopen(options); - // Produce two L0 files with overlapping ranges. 
- ASSERT_OK(Put(Key(0), "")); - ASSERT_OK(Put(Key(3), "")); - ASSERT_OK(Flush()); - ASSERT_OK(Put(Key(1), "")); - ASSERT_OK(Flush()); + auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { + int key0 = key_start; + int key1 = key_start + 1; + int key2 = key_start + 2; + Random rnd(301); + ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + } + ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(key1)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key1)), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key2)), 0); + ASSERT_EQ(num_reseek, + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + }; + // No reseek + assert_reseek_count(100, 0); - Status status; - // Fails because L0 has overlapping files. - status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); - ASSERT_TRUE(status.IsInvalidArgument()); + ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}})); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // Trigger reseek + assert_reseek_count(200, 1); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - // Now there is a file in L1. - ASSERT_GE(NumTableFilesAtLevel(1, 0), 1); + ASSERT_OK( + dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}})); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // No reseek + assert_reseek_count(300, 1); - ASSERT_OK(Put(Key(5), "")); - ASSERT_OK(Flush()); - // Fails because L1 is non-empty. 
- status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); - ASSERT_TRUE(status.IsInvalidArgument()); + MutableCFOptions mutable_cf_options; + CreateAndReopenWithCF({"pikachu"}, options); + // Test soft_pending_compaction_bytes_limit, + // hard_pending_compaction_bytes_limit + ASSERT_OK(dbfull()->SetOptions( + handles_[1], {{"soft_pending_compaction_bytes_limit", "200"}, + {"hard_pending_compaction_bytes_limit", "300"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit); + ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit); + // Test report_bg_io_stats + ASSERT_OK( + dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}})); + // sanity check + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_EQ(true, mutable_cf_options.report_bg_io_stats); + // Test min_partial_merge_operands + ASSERT_OK( + dbfull()->SetOptions(handles_[1], {{"min_partial_merge_operands", "4"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_EQ(4, mutable_cf_options.min_partial_merge_operands); + // Test compression + // sanity check + ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], + &mutable_cf_options)); + ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression); + ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], + &mutable_cf_options)); + ASSERT_EQ(CompressionType::kSnappyCompression, + mutable_cf_options.compression); + // Test paranoid_file_checks already done in db_block_cache_test + ASSERT_OK( + dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + 
&mutable_cf_options)); + ASSERT_EQ(true, mutable_cf_options.report_bg_io_stats); } +#endif // ROCKSDB_LITE -// Github issue #596 -TEST_F(DBTest, HugeNumberOfLevels) { +TEST_F(DBTest, L0L1L2AndUpHitCounter) { Options options = CurrentOptions(); - options.write_buffer_size = 2 * 1024 * 1024; // 2MB - options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB - options.num_levels = 12; - options.max_background_compactions = 10; - options.max_bytes_for_level_multiplier = 2; - options.level_compaction_dynamic_level_bytes = true; - DestroyAndReopen(options); - - Random rnd(301); - for (int i = 0; i < 300000; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - } - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); -} + options.write_buffer_size = 32 * 1024; + options.target_file_size_base = 32 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 64 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"mypikachu"}, options); -// Github issue #595 -// Large write batch with column families -TEST_F(DBTest, LargeBatchWithColumnFamilies) { - Options options; - options.env = env_; - options = CurrentOptions(options); - options.write_buffer_size = 100000; // Small write buffer - CreateAndReopenWithCF({"pikachu"}, options); - int64_t j = 0; - for (int i = 0; i < 5; i++) { - for (int pass = 1; pass <= 3; pass++) { - WriteBatch batch; - size_t write_size = 1024 * 1024 * (5 + i); - fprintf(stderr, "prepare: %ld MB, pass:%d\n", (write_size / 1024 / 1024), - pass); - for (;;) { - std::string data(3000, j++ % 127 + 20); - data += ToString(j); - batch.Put(handles_[0], Slice(data), Slice(data)); - if (batch.GetDataSize() > write_size) { - break; - } - } - fprintf(stderr, "write: 
%ld MB\n", (batch.GetDataSize() / 1024 / 1024)); - ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); - fprintf(stderr, "done\n"); - } + int numkeys = 20000; + for (int i = 0; i < numkeys; i++) { + ASSERT_OK(Put(1, Key(i), "val")); } - // make sure we can re-open it. - ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + for (int i = 0; i < numkeys; i++) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + + ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); } -// Make sure that Flushes can proceed in parallel with CompactRange() -TEST_F(DBTest, FlushesInParallelWithCompactRange) { - // iter == 0 -- leveled - // iter == 1 -- leveled, but throw in a flush between two levels compacting - // iter == 2 -- universal - for (int iter = 0; iter < 3; ++iter) { - Options options = CurrentOptions(); - if (iter < 2) { - options.compaction_style = kCompactionStyleLevel; - } else { - options.compaction_style = kCompactionStyleUniversal; +TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { + // iter 0 -- zlib + // iter 1 -- bzip2 + // iter 2 -- lz4 + // iter 3 -- lz4HC + // iter 4 -- xpress + CompressionType compressions[] = {kZlibCompression, kBZip2Compression, + kLZ4Compression, kLZ4HCCompression, + kXpressCompression}; + for (auto comp : compressions) { + if (!CompressionTypeSupported(comp)) { + continue; } - options.write_buffer_size = 110 << 10; - options.level0_file_num_compaction_trigger = 4; - options.num_levels = 4; - options.compression 
= kNoCompression; - options.max_bytes_for_level_base = 450 << 10; - options.target_file_size_base = 98 << 10; - options.max_write_buffer_number = 2; + // first_table_version 1 -- generate with table_version == 1, read with + // table_version == 2 + // first_table_version 2 -- generate with table_version == 2, read with + // table_version == 1 + for (int first_table_version = 1; first_table_version <= 2; + ++first_table_version) { + BlockBasedTableOptions table_options; + table_options.format_version = first_table_version; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.compression = comp; + DestroyAndReopen(options); - DestroyAndReopen(options); + int kNumKeysWritten = 100000; - Random rnd(301); - for (int num = 0; num < 14; num++) { - GenerateNewRandomFile(&rnd); - } + Random rnd(301); + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + } - if (iter == 1) { - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::RunManualCompaction()::1", - "DBTest::FlushesInParallelWithCompactRange:1"}, - {"DBTest::FlushesInParallelWithCompactRange:2", - "DBImpl::RunManualCompaction()::2"}}); - } else { - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"CompactionJob::Run():Start", - "DBTest::FlushesInParallelWithCompactRange:1"}, - {"DBTest::FlushesInParallelWithCompactRange:2", - "CompactionJob::Run():End"}}); + table_options.format_version = first_table_version == 1 ? 
2 : 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + ASSERT_EQ(r.substr(128), std::string(128, 'a')); + } } - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + } +} - std::vector threads; - threads.emplace_back([&]() { Compact("a", "z"); }); +TEST_F(DBTest, CompressionStatsTest) { + CompressionType type; - TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1"); + if (Snappy_Supported()) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (Zlib_Supported()) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2_Supported()) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else if (LZ4_Supported()) { + type = kLZ4Compression; + fprintf(stderr, "using lz4\n"); + } else if (XPRESS_Supported()) { + type = kXpressCompression; + fprintf(stderr, "using xpress\n"); + } else if (ZSTD_Supported()) { + type = kZSTDNotFinalCompression; + fprintf(stderr, "using ZSTD\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return; + } - // this has to start a flush. 
if flushes are blocked, this will try to - // create - // 3 memtables, and that will fail because max_write_buffer_number is 2 - for (int num = 0; num < 3; num++) { - GenerateNewRandomFile(&rnd, /* nowait */ true); - } + Options options = CurrentOptions(); + options.compression = type; + options.statistics = rocksdb::CreateDBStatistics(); + options.statistics->stats_level_ = StatsLevel::kAll; + DestroyAndReopen(options); - TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2"); + int kNumKeysWritten = 100000; - for (auto& t : threads) { - t.join(); - } - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + // Check that compressions occur and are counted when compression is turned on + Random rnd(301); + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); } -} + ASSERT_OK(Flush()); + ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0); -TEST_F(DBTest, DelayedWriteRate) { - Options options; - options.env = env_; - env_->no_sleep_ = true; - options = CurrentOptions(options); - options.write_buffer_size = 100000; // Small write buffer - options.max_write_buffer_number = 256; - options.disable_auto_compactions = true; - options.level0_file_num_compaction_trigger = 3; - options.level0_slowdown_writes_trigger = 3; - options.level0_stop_writes_trigger = 999999; - options.delayed_write_rate = 200000; // About 200KB/s limited rate + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0); - CreateAndReopenWithCF({"pikachu"}, options); + options.compression = kNoCompression; + DestroyAndReopen(options); + uint64_t currentCompressions = + options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); + uint64_t currentDecompressions = + options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED); - for (int i = 0; i < 3; i++) { - Put(Key(i), 
std::string(10000, 'x')); - Flush(); + // Check that compressions do not occur when turned off + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); } + ASSERT_OK(Flush()); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) + - currentCompressions, 0); - // These writes will be slowed down to 1KB/s - size_t estimated_total_size = 0; - Random rnd(301); - for (int i = 0; i < 3000; i++) { - auto rand_num = rnd.Uniform(20); - // Spread the size range to more. - size_t entry_size = rand_num * rand_num * rand_num; - WriteOptions wo; - Put(Key(i), std::string(entry_size, 'x'), wo); - estimated_total_size += entry_size + 20; - // Ocassionally sleep a while - if (rnd.Uniform(20) == 6) { - env_->SleepForMicroseconds(2666); - } + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); } - uint64_t estimated_sleep_time = - estimated_total_size / options.delayed_write_rate * 1000000U; - ASSERT_GT(env_->addon_time_.load(), estimated_sleep_time * 0.8); - ASSERT_LT(env_->addon_time_.load(), estimated_sleep_time * 1.1); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) + - currentDecompressions, 0); +} - env_->no_sleep_ = false; - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +TEST_F(DBTest, MutexWaitStatsDisabledByDefault) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + const uint64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, + kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0); + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); } -TEST_F(DBTest, HardLimit) { - Options options; - options.env = env_; - env_->SetBackgroundThreads(1, Env::LOW); - options 
= CurrentOptions(options); - options.max_write_buffer_number = 256; +TEST_F(DBTest, MutexWaitStats) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.statistics->stats_level_ = StatsLevel::kAll; + CreateAndReopenWithCF({"pikachu"}, options); + const uint64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, + kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay); + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); +} + +TEST_F(DBTest, CloseSpeedup) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB - options.arena_block_size = 4 * 1024; - options.level0_file_num_compaction_trigger = 4; - options.level0_slowdown_writes_trigger = 999999; - options.level0_stop_writes_trigger = 999999; - options.hard_pending_compaction_bytes_limit = 800 << 10; - options.max_bytes_for_level_base = 10000000000u; - options.max_background_compactions = 1; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_write_buffer_number = 16; + // Block background threads env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); test::SleepingBackgroundTask sleeping_task_low; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); - CreateAndReopenWithCF({"pikachu"}, options); + std::vector filenames; + env_->GetChildren(dbname_, &filenames); + // Delete archival files. 
+ for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(dbname_ + "/" + filenames[i]); + } + env_->DeleteDir(dbname_); + DestroyAndReopen(options); - std::atomic callback_count(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack("DBImpl::DelayWrite:Wait", - [&](void* arg) { - callback_count.fetch_add(1); - sleeping_task_low.WakeUp(); - }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); Random rnd(301); int key_idx = 0; + + // First three 110KB files are not going to level 2 + // After that, (100K, 200K) for (int num = 0; num < 5; num++) { GenerateNewFile(&rnd, &key_idx, true); } - ASSERT_EQ(0, callback_count.load()); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + Close(); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Unblock background threads + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + Destroy(options); +} + +class DelayedMergeOperator : public MergeOperator { + private: + DBTest* db_test_; + + public: + explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} + + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + db_test_->env_->addon_time_.fetch_add(1000); + merge_out->new_value = ""; + return true; + } + + virtual const char* Name() const override { return "DelayedMergeOperator"; } +}; + +TEST_F(DBTest, MergeTestTime) { + std::string one, two, three; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + + // Enable time profiling + SetPerfLevel(kEnableTime); + this->env_->addon_time_.store(0); + this->env_->time_elapse_only_sleep_ = true; + this->env_->no_sleep_ = true; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + options.merge_operator.reset(new DelayedMergeOperator(this)); + DestroyAndReopen(options); + + 
ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); + db_->Put(WriteOptions(), "foo", one); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", three)); + ASSERT_OK(Flush()); + + ReadOptions opt; + opt.verify_checksums = true; + opt.snapshot = nullptr; + std::string result; + db_->Get(opt, "foo", &result); - for (int num = 0; num < 5; num++) { - GenerateNewFile(&rnd, &key_idx, true); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); + + ReadOptions read_options; + std::unique_ptr iter(db_->NewIterator(read_options)); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; } - ASSERT_GE(callback_count.load(), 1); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(1, count); + ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); +#if ROCKSDB_USING_THREAD_STATUS + ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0); +#endif // ROCKSDB_USING_THREAD_STATUS + this->env_->time_elapse_only_sleep_ = false; } -TEST_F(DBTest, SoftLimit) { - Options options; - options.env = env_; - options = CurrentOptions(options); - options.write_buffer_size = 100000; // Small write buffer - options.max_write_buffer_number = 256; - options.level0_file_num_compaction_trigger = 3; - options.level0_slowdown_writes_trigger = 3; - options.level0_stop_writes_trigger = 999999; - options.delayed_write_rate = 200000; // About 200KB/s limited rate - options.soft_rate_limit = 1.1; - options.target_file_size_base = 99999999; // All into one file - options.max_bytes_for_level_base = 50000; - options.compression = kNoCompression; +#ifndef ROCKSDB_LITE +TEST_P(DBTestWithParam, MergeCompactionTimeTest) { + SetPerfLevel(kEnableTime); + Options options = CurrentOptions(); + options.compaction_filter_factory = 
std::make_shared(); + options.statistics = rocksdb::CreateDBStatistics(); + options.merge_operator.reset(new DelayedMergeOperator(this)); + options.compaction_style = kCompactionStyleUniversal; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); - Reopen(options); - Put(Key(0), ""); + for (int i = 0; i < 1000; i++) { + ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST")); + ASSERT_OK(Flush()); + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); - // Only allow two compactions - port::Mutex mut; - port::CondVar cv(&mut); - std::atomic compaction_cnt(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "VersionSet::LogAndApply:WriteManifest", [&](void* arg) { - // Three flushes and the first compaction, - // three flushes and the second compaction go through. - MutexLock l(&mut); - while (compaction_cnt.load() >= 8) { - cv.Wait(); - } - compaction_cnt.fetch_add(1); - }); + ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); +} - std::atomic sleep_count(0); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DelayWrite:Sleep", [&](void* arg) { sleep_count.fetch_add(1); }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +TEST_P(DBTestWithParam, FilterCompactionTimeTest) { + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(this); + options.disable_auto_compactions = true; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); - for (int i = 0; i < 3; i++) { - Put(Key(i), std::string(5000, 'x')); - Put(Key(100 - i), std::string(5000, 'x')); + // put some data + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } Flush(); } - while (compaction_cnt.load() < 4 || NumTableFilesAtLevel(0) > 0) { - env_->SleepForMicroseconds(1000); - } - // Now 
there is one L1 file but doesn't trigger soft_rate_limit - ASSERT_EQ(NumTableFilesAtLevel(1), 1); - ASSERT_EQ(sleep_count.load(), 0); - for (int i = 0; i < 3; i++) { - Put(Key(10 + i), std::string(5000, 'x')); - Put(Key(90 - i), std::string(5000, 'x')); - Flush(); - } - while (compaction_cnt.load() < 8 || NumTableFilesAtLevel(0) > 0) { - env_->SleepForMicroseconds(1000); - } - ASSERT_EQ(NumTableFilesAtLevel(1), 1); - ASSERT_EQ(sleep_count.load(), 0); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ(0U, CountLiveFiles()); - // Slowdown is triggered now - for (int i = 0; i < 10; i++) { - Put(Key(i), std::string(100, 'x')); - } - ASSERT_GT(sleep_count.load(), 0); + Reopen(options); - { - MutexLock l(&mut); - compaction_cnt.store(7); - cv.SignalAll(); - } - while (NumTableFilesAtLevel(1) > 0) { - env_->SleepForMicroseconds(1000); - } + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); + delete itr; +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, TestLogCleanup) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; // very small + // only two memtables allowed ==> only two log files + options.max_write_buffer_number = 2; + Reopen(options); - // Slowdown is not triggered any more. - sleep_count.store(0); - // Slowdown is not triggered now - for (int i = 0; i < 10; i++) { - Put(Key(i), std::string(100, 'x')); + for (int i = 0; i < 100000; ++i) { + Put(Key(i), "val"); + // only 2 memtables will be alive, so logs_to_free needs to always be below + // 2 + ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); } - ASSERT_EQ(sleep_count.load(), 0); +} - // shrink level base so L2 will hit soft limit easier. 
- ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_base", "5000"}, - })); - compaction_cnt.store(7); - Flush(); +#ifndef ROCKSDB_LITE +TEST_F(DBTest, EmptyCompactedDB) { + Options options = CurrentOptions(); + options.max_open_files = -1; + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + Status s = Put("new", "value"); + ASSERT_TRUE(s.IsNotSupported()); + Close(); +} +#endif // ROCKSDB_LITE - while (NumTableFilesAtLevel(0) == 0) { - env_->SleepForMicroseconds(1000); - } +#ifndef ROCKSDB_LITE +TEST_F(DBTest, SuggestCompactRangeTest) { + class CompactionFilterFactoryGetContext : public CompactionFilterFactory { + public: + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + saved_context = context; + std::unique_ptr empty_filter; + return empty_filter; + } + const char* Name() const override { + return "CompactionFilterFactoryGetContext"; + } + static bool IsManual(CompactionFilterFactory* compaction_filter_factory) { + return reinterpret_cast( + compaction_filter_factory) + ->saved_context.is_manual_compaction; + } + CompactionFilter::Context saved_context; + }; - // Slowdown is triggered now - for (int i = 0; i < 10; i++) { - Put(Key(i), std::string(100, 'x')); - } - ASSERT_GT(sleep_count.load(), 0); + Options options = CurrentOptions(); + options.memtable_factory.reset( + new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.compaction_style = kCompactionStyleLevel; + options.compaction_filter_factory.reset( + new CompactionFilterFactoryGetContext()); + options.write_buffer_size = 200 << 10; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 98 << 10; + options.max_grandparent_overlap_factor = 1 << 20; // inf - { - MutexLock l(&mut); - compaction_cnt.store(7); - cv.SignalAll(); - } + 
Reopen(options); - while (NumTableFilesAtLevel(2) != 0) { - env_->SleepForMicroseconds(1000); - } + Random rnd(301); - // Slowdown is not triggered anymore - sleep_count.store(0); - for (int i = 0; i < 10; i++) { - Put(Key(i), std::string(100, 'x')); + for (int num = 0; num < 3; num++) { + GenerateNewRandomFile(&rnd); } - ASSERT_EQ(sleep_count.load(), 0); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); -} -TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { - CompressionType compressions[] = {kZlibCompression, kBZip2Compression, - kLZ4Compression, kLZ4HCCompression}; - for (int iter = 0; iter < 4; ++iter) { - if (!CompressionTypeSupported(compressions[iter])) { - // not supported, we should fail the Open() - Options options = CurrentOptions(); - options.compression = compressions[iter]; - ASSERT_TRUE(!TryReopen(options).ok()); - // Try if CreateColumnFamily also fails - options.compression = kNoCompression; - ASSERT_OK(TryReopen(options)); - ColumnFamilyOptions cf_options(options); - cf_options.compression = compressions[iter]; - ColumnFamilyHandle* handle; - ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok()); - } - } -} + GenerateNewRandomFile(&rnd); + ASSERT_EQ("0,4", FilesPerLevel(0)); + ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual( + options.compaction_filter_factory.get())); -TEST_F(DBTest, RowCache) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - options.row_cache = NewLRUCache(8192); - DestroyAndReopen(options); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("1,4", FilesPerLevel(0)); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Flush()); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("2,4", FilesPerLevel(0)); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0); - ASSERT_EQ(Get("foo"), "bar"); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 
1); - ASSERT_EQ(Get("foo"), "bar"); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); -} + GenerateNewRandomFile(&rnd); + ASSERT_EQ("3,4", FilesPerLevel(0)); -// TODO(3.13): fix the issue of Seek() + Prev() which might not necessary -// return the biggest key which is smaller than the seek key. -TEST_F(DBTest, PrevAfterMerge) { - Options options; - options.create_if_missing = true; - options.merge_operator = MergeOperators::CreatePutOperator(); - DestroyAndReopen(options); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("0,4,4", FilesPerLevel(0)); - // write three entries with different keys using Merge() - WriteOptions wopts; - db_->Merge(wopts, "1", "data1"); - db_->Merge(wopts, "2", "data2"); - db_->Merge(wopts, "3", "data3"); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); - std::unique_ptr it(db_->NewIterator(ReadOptions())); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("2,4,4", FilesPerLevel(0)); - it->Seek("2"); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("2", it->key().ToString()); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("3,4,4", FilesPerLevel(0)); - it->Prev(); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("1", it->key().ToString()); -} + GenerateNewRandomFile(&rnd); + ASSERT_EQ("0,4,8", FilesPerLevel(0)); -TEST_F(DBTest, DeletingOldWalAfterDrop) { - rocksdb::SyncPoint::GetInstance()->LoadDependency( - { { "Test:AllowFlushes", "DBImpl::BGWorkFlush" }, - { "DBImpl::BGWorkFlush:done", "Test:WaitForFlush"} }); - rocksdb::SyncPoint::GetInstance()->ClearTrace(); + GenerateNewRandomFile(&rnd); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - Options options = CurrentOptions(); - options.max_total_wal_size = 8192; - options.compression = kNoCompression; - options.write_buffer_size = 1 << 20; - options.level0_file_num_compaction_trigger = (1<<30); - options.level0_slowdown_writes_trigger = (1<<30); - options.level0_stop_writes_trigger 
= (1<<30); - options.disable_auto_compactions = true; - DestroyAndReopen(options); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // compact it three times + for (int i = 0; i < 3; ++i) { + ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); + dbfull()->TEST_WaitForCompact(); + } - CreateColumnFamilies({"cf1", "cf2"}, options); - ASSERT_OK(Put(0, "key1", DummyString(8192))); - ASSERT_OK(Put(0, "key2", DummyString(8192))); - // the oldest wal should now be getting_flushed - ASSERT_OK(db_->DropColumnFamily(handles_[0])); - // all flushes should now do nothing because their CF is dropped - TEST_SYNC_POINT("Test:AllowFlushes"); - TEST_SYNC_POINT("Test:WaitForFlush"); - uint64_t lognum1 = dbfull()->TEST_LogfileNumber(); - ASSERT_OK(Put(1, "key3", DummyString(8192))); - ASSERT_OK(Put(1, "key4", DummyString(8192))); - // new wal should have been created - uint64_t lognum2 = dbfull()->TEST_LogfileNumber(); - EXPECT_GT(lognum2, lognum1); -} + // All files are compacted + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + + GenerateNewRandomFile(&rnd); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // nonoverlapping with the file on level 0 + Slice start("a"), end("b"); + ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + dbfull()->TEST_WaitForCompact(); -TEST_F(DBTest, RateLimitedDelete) { - rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"DBTest::RateLimitedDelete:1", - "DeleteSchedulerImpl::BackgroundEmptyTrash"}, - }); + // should not compact the level 0 file + ASSERT_EQ(1, NumTableFilesAtLevel(0)); - std::vector penalties; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", - [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + start = Slice("j"); + end = Slice("m"); + ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + dbfull()->TEST_WaitForCompact(); + 
ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual( + options.compaction_filter_factory.get())); + + // now it should compact the level 0 file + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); +} +TEST_F(DBTest, PromoteL0) { Options options = CurrentOptions(); options.disable_auto_compactions = true; - options.env = env_; + options.write_buffer_size = 10 * 1024 * 1024; + DestroyAndReopen(options); - std::string trash_dir = test::TmpDir(env_) + "/trash"; - int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec - Status s; - options.delete_scheduler.reset(NewDeleteScheduler( - env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); - ASSERT_OK(s); + // non overlapping ranges + std::vector> ranges = { + {81, 160}, {0, 80}, {161, 240}, {241, 320}}; - Destroy(last_options_); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(TryReopen(options)); - // Create 4 files in L0 - for (char v = 'a'; v <= 'd'; v++) { - ASSERT_OK(Put("Key2", DummyString(1024, v))); - ASSERT_OK(Put("Key3", DummyString(1024, v))); - ASSERT_OK(Put("Key4", DummyString(1024, v))); - ASSERT_OK(Put("Key1", DummyString(1024, v))); - ASSERT_OK(Put("Key4", DummyString(1024, v))); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + for (const auto& range : ranges) { + for (int32_t j = range.first; j < range.second; j++) { + values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); + } ASSERT_OK(Flush()); } - // We created 4 sst files in L0 - ASSERT_EQ("4", FilesPerLevel(0)); - - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - // Compaction will move the 4 files in L0 to trash and create 1 L1 file - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(0)); + int32_t level0_files = NumTableFilesAtLevel(0, 0); + ASSERT_EQ(level0_files, ranges.size()); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 - uint64_t 
delete_start_time = env_->NowMicros(); - // Hold BackgroundEmptyTrash - TEST_SYNC_POINT("DBTest::RateLimitedDelete:1"); - options.delete_scheduler->WaitForEmptyTrash(); - uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time; + // Promote L0 level to L2. + ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2)); + // We expect that all the files were trivially moved from L0 to L2 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files); - uint64_t total_files_size = 0; - uint64_t expected_penlty = 0; - ASSERT_EQ(penalties.size(), metadata.size()); - for (size_t i = 0; i < metadata.size(); i++) { - total_files_size += metadata[i].size; - expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec); - ASSERT_EQ(expected_penlty, penalties[i]); + for (const auto& kv : values) { + ASSERT_EQ(Get(Key(kv.first)), kv.second); } - ASSERT_GT(time_spent_deleting, expected_penlty * 0.9); - - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -// Create a DB with 2 db_paths, and generate multiple files in the 2 -// db_paths using CompactRangeOptions, make sure that files that were -// deleted from first db_path were deleted using DeleteScheduler and -// files in the second path were not. 
-TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { - int bg_delete_file = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", - [&](void* arg) { bg_delete_file++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - +TEST_F(DBTest, PromoteL0Failure) { Options options = CurrentOptions(); options.disable_auto_compactions = true; - options.db_paths.emplace_back(dbname_, 1024 * 100); - options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100); - options.env = env_; - - std::string trash_dir = test::TmpDir(env_) + "/trash"; - int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec - Status s; - options.delete_scheduler.reset(NewDeleteScheduler( - env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); - ASSERT_OK(s); - + options.write_buffer_size = 10 * 1024 * 1024; DestroyAndReopen(options); - // Create 4 files in L0 - for (int i = 0; i < 4; i++) { - ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); - ASSERT_OK(Flush()); - } - // We created 4 sst files in L0 - ASSERT_EQ("4", FilesPerLevel(0)); - // Compaction will delete files from L0 in first db path and generate a new - // file in L1 in second db path - CompactRangeOptions compact_options; - compact_options.target_path_id = 1; - Slice begin("Key0"); - Slice end("Key3"); - ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); - ASSERT_EQ("0,1", FilesPerLevel(0)); - - // Create 4 files in L0 - for (int i = 4; i < 8; i++) { - ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'))); - ASSERT_OK(Flush()); - } - ASSERT_EQ("4,1", FilesPerLevel(0)); + // Produce two L0 files with overlapping ranges. 
+ ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(Put(Key(3), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "")); + ASSERT_OK(Flush()); - // Compaction will delete files from L0 in first db path and generate a new - // file in L1 in second db path - begin = "Key4"; - end = "Key7"; - ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); - ASSERT_EQ("0,2", FilesPerLevel(0)); + Status status; + // Fails because L0 has overlapping files. + status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); + ASSERT_TRUE(status.IsInvalidArgument()); - options.delete_scheduler->WaitForEmptyTrash(); - ASSERT_EQ(bg_delete_file, 8); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Now there is a file in L1. + ASSERT_GE(NumTableFilesAtLevel(1, 0), 1); - compact_options.bottommost_level_compaction = - BottommostLevelCompaction::kForce; - ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_OK(Put(Key(5), "")); + ASSERT_OK(Flush()); + // Fails because L1 is non-empty. 
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); + ASSERT_TRUE(status.IsInvalidArgument()); +} +#endif // ROCKSDB_LITE - options.delete_scheduler->WaitForEmptyTrash(); - ASSERT_EQ(bg_delete_file, 8); +// Github issue #596 +TEST_F(DBTest, HugeNumberOfLevels) { + Options options = CurrentOptions(); + options.write_buffer_size = 2 * 1024 * 1024; // 2MB + options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB + options.num_levels = 12; + options.max_background_compactions = 10; + options.max_bytes_for_level_multiplier = 2; + options.level_compaction_dynamic_level_bytes = true; + DestroyAndReopen(options); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); -} + Random rnd(301); + for (int i = 0; i < 300000; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } -TEST_F(DBTest, DestroyDBWithRateLimitedDelete) { - int bg_delete_file = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", - [&](void* arg) { bg_delete_file++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +} +TEST_F(DBTest, AutomaticConflictsWithManualCompaction) { Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.env = env_; + options.write_buffer_size = 2 * 1024 * 1024; // 2MB + options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB + options.num_levels = 12; + options.max_background_compactions = 10; + options.max_bytes_for_level_multiplier = 2; + options.level_compaction_dynamic_level_bytes = true; DestroyAndReopen(options); - // Create 4 files in L0 - for (int i = 0; i < 4; i++) { - ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'))); - ASSERT_OK(Flush()); + Random rnd(301); + for (int i = 0; i < 300000; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); } - // We created 4 sst files in L0 - ASSERT_EQ("4", FilesPerLevel(0)); - - // Close DB and destory it using 
DeleteScheduler - Close(); - std::string trash_dir = test::TmpDir(env_) + "/trash"; - int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec - Status s; - options.delete_scheduler.reset(NewDeleteScheduler( - env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); - ASSERT_OK(s); - ASSERT_OK(DestroyDB(dbname_, options)); - options.delete_scheduler->WaitForEmptyTrash(); - // We have deleted the 4 sst files in the delete_scheduler - ASSERT_EQ(bg_delete_file, 4); + std::atomic callback_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction()::Conflict", + [&](void* arg) { callback_count.fetch_add(1); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + CompactRangeOptions croptions; + croptions.exclusive_manual_compaction = false; + ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); + ASSERT_GE(callback_count.load(), 1); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + for (int i = 0; i < 300000; ++i) { + ASSERT_NE("NOT_FOUND", Get(Key(i))); + } } -TEST_F(DBTest, UnsupportedManualSync) { - DestroyAndReopen(CurrentOptions()); - env_->is_wal_sync_thread_safe_.store(false); - Status s = db_->SyncWAL(); - ASSERT_TRUE(s.IsNotSupported()); +// Github issue #595 +// Large write batch with column families +TEST_F(DBTest, LargeBatchWithColumnFamilies) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + CreateAndReopenWithCF({"pikachu"}, options); + int64_t j = 0; + for (int i = 0; i < 5; i++) { + for (int pass = 1; pass <= 3; pass++) { + WriteBatch batch; + size_t write_size = 1024 * 1024 * (5 + i); + fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n", + (write_size / 1024 / 1024), pass); + for (;;) { + std::string data(3000, j++ % 127 + 20); + data += ToString(j); + batch.Put(handles_[0], Slice(data), Slice(data)); + if (batch.GetDataSize() > write_size) { + break; + } + } + fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n", + 
(batch.GetDataSize() / 1024 / 1024)); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + fprintf(stderr, "done\n"); + } + } + // make sure we can re-open it. + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } -TEST_F(DBTest, OpenDBWithInfiniteMaxOpenFiles) { - // Open DB with infinite max open files - // - First iteration use 1 thread to open files - // - Second iteration use 5 threads to open files - for (int iter = 0; iter < 2; iter++) { - Options options; - options.create_if_missing = true; - options.write_buffer_size = 100000; - options.disable_auto_compactions = true; - options.max_open_files = -1; - if (iter == 0) { - options.max_file_opening_threads = 1; +// Make sure that Flushes can proceed in parallel with CompactRange() +TEST_F(DBTest, FlushesInParallelWithCompactRange) { + // iter == 0 -- leveled + // iter == 1 -- leveled, but throw in a flush between two levels compacting + // iter == 2 -- universal + for (int iter = 0; iter < 3; ++iter) { + Options options = CurrentOptions(); + if (iter < 2) { + options.compaction_style = kCompactionStyleLevel; } else { - options.max_file_opening_threads = 5; + options.compaction_style = kCompactionStyleUniversal; } - options = CurrentOptions(options); + options.write_buffer_size = 110 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 98 << 10; + options.max_write_buffer_number = 2; + DestroyAndReopen(options); - // Create 12 Files in L0 (then move then to L2) - for (int i = 0; i < 12; i++) { - std::string k = "L2_" + Key(i); - ASSERT_OK(Put(k, k + std::string(1000, 'a'))); - ASSERT_OK(Flush()); + Random rnd(301); + for (int num = 0; num < 14; num++) { + GenerateNewRandomFile(&rnd); } - CompactRangeOptions compact_options; - compact_options.change_level = true; - compact_options.target_level = 2; - db_->CompactRange(compact_options, 
nullptr, nullptr); - // Create 12 Files in L0 - for (int i = 0; i < 12; i++) { - std::string k = "L0_" + Key(i); - ASSERT_OK(Put(k, k + std::string(1000, 'a'))); - ASSERT_OK(Flush()); + if (iter == 1) { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction()::1", + "DBTest::FlushesInParallelWithCompactRange:1"}, + {"DBTest::FlushesInParallelWithCompactRange:2", + "DBImpl::RunManualCompaction()::2"}}); + } else { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"CompactionJob::Run():Start", + "DBTest::FlushesInParallelWithCompactRange:1"}, + {"DBTest::FlushesInParallelWithCompactRange:2", + "CompactionJob::Run():End"}}); } - Close(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - // Reopening the DB will load all exisitng files - Reopen(options); - ASSERT_EQ("12,0,12", FilesPerLevel(0)); - std::vector> files; - dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + std::vector threads; + threads.emplace_back([&]() { Compact("a", "z"); }); - for (const auto& level : files) { - for (const auto& file : level) { - ASSERT_TRUE(file.table_reader_handle != nullptr); - } + TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1"); + + // this has to start a flush. 
if flushes are blocked, this will try to + // create + // 3 memtables, and that will fail because max_write_buffer_number is 2 + for (int num = 0; num < 3; num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); } - for (int i = 0; i < 12; i++) { - ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a')); - ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a')); + TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2"); + + for (auto& t : threads) { + t.join(); } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } } -TEST_F(DBTest, GetTotalSstFilesSize) { +TEST_F(DBTest, DelayedWriteRate) { + const int kEntriesPerMemTable = 100; + const int kTotalFlushes = 20; + Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.compression = kNoCompression; - DestroyAndReopen(options); - // Generate 5 files in L0 - for (int i = 0; i < 5; i++) { - for (int j = 0; j < 10; j++) { - std::string val = "val_file_" + ToString(i); - ASSERT_OK(Put(Key(j), val)); - } + env_->SetBackgroundThreads(1, Env::LOW); + options.env = env_; + env_->no_sleep_ = true; + options.write_buffer_size = 100000000; + options.max_write_buffer_number = 256; + options.max_background_compactions = 1; + options.level0_file_num_compaction_trigger = 3; + options.level0_slowdown_writes_trigger = 3; + options.level0_stop_writes_trigger = 999999; + options.delayed_write_rate = 20000000; // Start with 200MB/s + options.memtable_factory.reset( + new SpecialSkipListFactory(kEntriesPerMemTable)); + + CreateAndReopenWithCF({"pikachu"}, options); + + // Block compactions + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + for (int i = 0; i < 3; i++) { + Put(Key(i), std::string(10000, 'x')); Flush(); } - ASSERT_EQ("5", FilesPerLevel(0)); - - std::vector live_files_meta; - dbfull()->GetLiveFilesMetaData(&live_files_meta); - 
ASSERT_EQ(live_files_meta.size(), 5); - uint64_t single_file_size = live_files_meta[0].size; - uint64_t live_sst_files_size = 0; - uint64_t total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; + // These writes will be slowed down to 1KB/s + uint64_t estimated_sleep_time = 0; + Random rnd(301); + Put("", ""); + uint64_t cur_rate = options.delayed_write_rate; + for (int i = 0; i < kTotalFlushes; i++) { + uint64_t size_memtable = 0; + for (int j = 0; j < kEntriesPerMemTable; j++) { + auto rand_num = rnd.Uniform(20); + // Spread the size range to more. + size_t entry_size = rand_num * rand_num * rand_num; + WriteOptions wo; + Put(Key(i), std::string(entry_size, 'x'), wo); + size_memtable += entry_size + 18; + // Occasionally sleep a while + if (rnd.Uniform(20) == 6) { + env_->SleepForMicroseconds(2666); + } + } + dbfull()->TEST_WaitForFlushMemTable(); + estimated_sleep_time += size_memtable * 1000000u / cur_rate; + // Slow down twice. One for memtable switch and one for flush finishes. + cur_rate = static_cast(static_cast(cur_rate) / + kSlowdownRatio / kSlowdownRatio); } + // Estimate the total sleep time fall into the rough range. 
+ ASSERT_GT(env_->addon_time_.load(), + static_cast(estimated_sleep_time / 2)); + ASSERT_LT(env_->addon_time_.load(), + static_cast(estimated_sleep_time * 2)); + + env_->no_sleep_ = false; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 5 - // Total SST files = 5 - ASSERT_EQ(live_sst_files_size, 5 * single_file_size); - ASSERT_EQ(total_sst_files_size, 5 * single_file_size); +TEST_F(DBTest, HardLimit) { + Options options = CurrentOptions(); + options.env = env_; + env_->SetBackgroundThreads(1, Env::LOW); + options.max_write_buffer_number = 256; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 * 1024; + options.level0_file_num_compaction_trigger = 4; + options.level0_slowdown_writes_trigger = 999999; + options.level0_stop_writes_trigger = 999999; + options.hard_pending_compaction_bytes_limit = 800 << 10; + options.max_bytes_for_level_base = 10000000000u; + options.max_background_compactions = 1; + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); - // hold current version - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); - // Compact 5 files into 1 file in L0 - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(0)); + CreateAndReopenWithCF({"pikachu"}, options); - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 1); + std::atomic callback_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack("DBImpl::DelayWrite:Wait", + [&](void* arg) { + 
callback_count.fetch_add(1); + sleeping_task_low.WakeUp(); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - live_sst_files_size = 0; - total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; + Random rnd(301); + int key_idx = 0; + for (int num = 0; num < 5; num++) { + GenerateNewFile(&rnd, &key_idx, true); + dbfull()->TEST_WaitForFlushMemTable(); } - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 1 (compacted file) - // Total SST files = 6 (5 original files + compacted file) - ASSERT_EQ(live_sst_files_size, 1 * single_file_size); - ASSERT_EQ(total_sst_files_size, 6 * single_file_size); - // hold current version - std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_EQ(0, callback_count.load()); - // Delete all keys and compact, this will delete all live files - for (int i = 0; i < 10; i++) { - ASSERT_OK(Delete(Key(i))); + for (int num = 0; num < 5; num++) { + GenerateNewFile(&rnd, &key_idx, true); + dbfull()->TEST_WaitForFlushMemTable(); } - Flush(); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("", FilesPerLevel(0)); - - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 0); - - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 6 (5 original files + compacted file) - ASSERT_EQ(total_sst_files_size, 6 * single_file_size); - - iter1.reset(); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 1 (compacted file) - ASSERT_EQ(total_sst_files_size, 1 * single_file_size); - - iter2.reset(); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total 
SST files = 0 - ASSERT_EQ(total_sst_files_size, 0); + ASSERT_GE(callback_count.load(), 1); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + sleeping_task_low.WaitUntilDone(); } -TEST_F(DBTest, GetTotalSstFilesSizeVersionsFilesShared) { +#ifndef ROCKSDB_LITE +TEST_F(DBTest, SoftLimit) { Options options = CurrentOptions(); - options.disable_auto_compactions = true; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options.max_write_buffer_number = 256; + options.level0_file_num_compaction_trigger = 1; + options.level0_slowdown_writes_trigger = 3; + options.level0_stop_writes_trigger = 999999; + options.delayed_write_rate = 20000; // About 200KB/s limited rate + options.soft_pending_compaction_bytes_limit = 160000; + options.target_file_size_base = 99999999; // All into one file + options.max_bytes_for_level_base = 50000; + options.max_bytes_for_level_multiplier = 10; + options.max_background_compactions = 1; options.compression = kNoCompression; - DestroyAndReopen(options); - // Generate 5 files in L0 - for (int i = 0; i < 5; i++) { - ASSERT_OK(Put(Key(i), "val")); - Flush(); - } - ASSERT_EQ("5", FilesPerLevel(0)); - std::vector live_files_meta; - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 5); - uint64_t single_file_size = live_files_meta[0].size; + Reopen(options); - uint64_t live_sst_files_size = 0; - uint64_t total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; + // Generating 360KB in Level 3 + for (int i = 0; i < 72; i++) { + Put(Key(i), std::string(5000, 'x')); + if (i % 10 == 0) { + Flush(); + } } + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(3); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - - // Live SST files = 5 - // Total SST files = 5 - ASSERT_EQ(live_sst_files_size, 5 * single_file_size); - ASSERT_EQ(total_sst_files_size, 5 * 
single_file_size); - - // hold current version - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + // Generating 360KB in Level 2 + for (int i = 0; i < 72; i++) { + Put(Key(i), std::string(5000, 'x')); + if (i % 10 == 0) { + Flush(); + } + } + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(2); - // Compaction will do trivial move from L0 to L1 - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("0,5", FilesPerLevel(0)); + Put(Key(0), ""); - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 5); + test::SleepingBackgroundTask sleeping_task_low; + // Block compactions + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); - live_sst_files_size = 0; - total_sst_files_size = 0; - for (const auto& file_meta : live_files_meta) { - live_sst_files_size += file_meta.size; + // Create 3 L0 files, making score of L0 to be 3. + for (int i = 0; i < 3; i++) { + Put(Key(i), std::string(5000, 'x')); + Put(Key(100 - i), std::string(5000, 'x')); + // Flush the file. File size is around 30KB. 
+ Flush(); } - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 5 - // Total SST files = 5 (used in 2 version) - ASSERT_EQ(live_sst_files_size, 5 * single_file_size); - ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - // hold current version - std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + sleeping_task_low.Reset(); + dbfull()->TEST_WaitForCompact(); - // Delete all keys and compact, this will delete all live files - for (int i = 0; i < 5; i++) { - ASSERT_OK(Delete(Key(i))); - } - Flush(); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ("", FilesPerLevel(0)); - - live_files_meta.clear(); - dbfull()->GetLiveFilesMetaData(&live_files_meta); - ASSERT_EQ(live_files_meta.size(), 0); - - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 5 (used in 2 version) - ASSERT_EQ(total_sst_files_size, 5 * single_file_size); - - iter1.reset(); - iter2.reset(); - - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", - &total_sst_files_size)); - // Live SST files = 0 - // Total SST files = 0 - ASSERT_EQ(total_sst_files_size, 0); -} + // Now there is one L1 file but doesn't trigger soft_rate_limit + // The L1 file size is around 30KB. + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); -TEST_F(DBTest, AddExternalSstFile) { - do { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - env_->CreateDir(sst_files_folder); - Options options = CurrentOptions(); - options.env = env_; - const ImmutableCFOptions ioptions(options); + // Only allow one compactin going through. 
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void* arg) { + // Schedule a sleeping task. + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_low, Env::Priority::LOW); + }); - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - // file1.sst (0 => 99) - std::string file1 = sst_files_folder + "file1.sst"; - ASSERT_OK(sst_file_writer.Open(file1)); - for (int k = 0; k < 100; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file1_info.file_path, file1); - ASSERT_EQ(file1_info.num_entries, 100); - ASSERT_EQ(file1_info.smallest_key, Key(0)); - ASSERT_EQ(file1_info.largest_key, Key(99)); - // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Add(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); - - // file2.sst (100 => 199) - std::string file2 = sst_files_folder + "file2.sst"; - ASSERT_OK(sst_file_writer.Open(file2)); - for (int k = 100; k < 200; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - // Cannot add this key because it's not after last added key - s = sst_file_writer.Add(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); - ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file2_info.file_path, file2); - ASSERT_EQ(file2_info.num_entries, 100); - ASSERT_EQ(file2_info.smallest_key, Key(100)); - ASSERT_EQ(file2_info.largest_key, Key(199)); - - // file3.sst (195 => 299) - // This file values overlap with file2 values - std::string file3 = sst_files_folder + "file3.sst"; - ASSERT_OK(sst_file_writer.Open(file3)); - for (int k = 195; k < 300; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + 
"_val_overlap")); - } - ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file3_info.file_path, file3); - ASSERT_EQ(file3_info.num_entries, 105); - ASSERT_EQ(file3_info.smallest_key, Key(195)); - ASSERT_EQ(file3_info.largest_key, Key(299)); - - // file4.sst (30 => 39) - // This file values overlap with file1 values - std::string file4 = sst_files_folder + "file4.sst"; - ASSERT_OK(sst_file_writer.Open(file4)); - for (int k = 30; k < 40; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); - } - ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file4_info.file_path, file4); - ASSERT_EQ(file4_info.num_entries, 10); - ASSERT_EQ(file4_info.smallest_key, Key(30)); - ASSERT_EQ(file4_info.largest_key, Key(39)); - - // file5.sst (400 => 499) - std::string file5 = sst_files_folder + "file5.sst"; - ASSERT_OK(sst_file_writer.Open(file5)); - for (int k = 400; k < 500; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file5_info.file_path, file5); - ASSERT_EQ(file5_info.num_entries, 100); - ASSERT_EQ(file5_info.smallest_key, Key(400)); - ASSERT_EQ(file5_info.largest_key, Key(499)); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + // Create 3 L0 files, making score of L0 to be 3 + for (int i = 0; i < 3; i++) { + Put(Key(10 + i), std::string(5000, 'x')); + Put(Key(90 - i), std::string(5000, 'x')); + // Flush the file. File size is around 30KB. 
+ Flush(); + } - DestroyAndReopen(options); - // Add file using file path - s = db_->AddFile(file1); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); - for (int k = 0; k < 100; k++) { - ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); - } + // Wake up sleep task to enable compaction to run and waits + // for it to go to sleep state again to make sure one compaction + // goes through. + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilSleeping(); - // Add file using file info - s = db_->AddFile(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); - for (int k = 0; k < 200; k++) { - ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); - } + // Now there is one L1 file (around 60KB) which exceeds 50KB base by 10KB + // Given level multiplier 10, estimated pending compaction is around 100KB + // doesn't trigger soft_pending_compaction_bytes_limit + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - // This file have overlapping values with the exisitng data - s = db_->AddFile(file3); - ASSERT_FALSE(s.ok()) << s.ToString(); + // Create 3 L0 files, making score of L0 to be 3, higher than L0. + for (int i = 0; i < 3; i++) { + Put(Key(20 + i), std::string(5000, 'x')); + Put(Key(80 - i), std::string(5000, 'x')); + // Flush the file. File size is around 30KB. + Flush(); + } + // Wake up sleep task to enable compaction to run and waits + // for it to go to sleep state again to make sure one compaction + // goes through. 
+ sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilSleeping(); - // This file have overlapping values with the exisitng data - s = db_->AddFile(&file4_info); - ASSERT_FALSE(s.ok()) << s.ToString(); + // Now there is one L1 file (around 90KB) which exceeds 50KB base by 40KB + // L2 size is 360KB, so the estimated level fanout 4, estimated pending + // compaction is around 200KB + // triggerring soft_pending_compaction_bytes_limit + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - // Overwrite values of keys divisible by 5 - for (int k = 0; k < 200; k += 5) { - ASSERT_OK(Put(Key(k), Key(k) + "_val_new")); - } - ASSERT_NE(db_->GetLatestSequenceNumber(), 0U); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilSleeping(); - // DB have values in memtable now, we cannot add files anymore - s = db_->AddFile(file5); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - // Make sure values are correct before and after flush/compaction - for (int i = 0; i < 2; i++) { - for (int k = 0; k < 200; k++) { - std::string value = Key(k) + "_val"; - if (k % 5 == 0) { - value += "_new"; - } - ASSERT_EQ(Get(Key(k)), value); - } - ASSERT_OK(Flush()); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - } + // shrink level base so L2 will hit soft limit easier. 
+ ASSERT_OK(dbfull()->SetOptions({ + {"max_bytes_for_level_base", "5000"}, + })); + + Put("", ""); + Flush(); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - // DB sequence number is not zero, cannot add files anymore - s = db_->AddFile(file5); - ASSERT_FALSE(s.ok()) << s.ToString(); - } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | - kSkipFIFOCompaction)); + sleeping_task_low.WaitUntilSleeping(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } -TEST_F(DBTest, AddExternalSstFileNoCopy) { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - env_->CreateDir(sst_files_folder); +TEST_F(DBTest, LastWriteBufferDelay) { Options options = CurrentOptions(); options.env = env_; - const ImmutableCFOptions ioptions(options); - - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); - - // file1.sst (0 => 99) - std::string file1 = sst_files_folder + "file1.sst"; - ASSERT_OK(sst_file_writer.Open(file1)); - for (int k = 0; k < 100; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file1_info.file_path, file1); - ASSERT_EQ(file1_info.num_entries, 100); - ASSERT_EQ(file1_info.smallest_key, Key(0)); - ASSERT_EQ(file1_info.largest_key, Key(99)); - - // file2.sst (100 => 299) - std::string file2 = sst_files_folder + "file2.sst"; - ASSERT_OK(sst_file_writer.Open(file2)); - for (int k = 100; k < 300; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val")); - } - ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file2_info.file_path, file2); - ASSERT_EQ(file2_info.num_entries, 200); - ASSERT_EQ(file2_info.smallest_key, Key(100)); - ASSERT_EQ(file2_info.largest_key, Key(299)); - - // file3.sst (110 => 
124) .. overlap with file2.sst - std::string file3 = sst_files_folder + "file3.sst"; - ASSERT_OK(sst_file_writer.Open(file3)); - for (int k = 110; k < 125; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap")); - } - ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(file3_info.file_path, file3); - ASSERT_EQ(file3_info.num_entries, 15); - ASSERT_EQ(file3_info.smallest_key, Key(110)); - ASSERT_EQ(file3_info.largest_key, Key(124)); - - s = db_->AddFile(&file1_info, true /* move file */); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); - - s = db_->AddFile(&file2_info, false /* copy file */); - ASSERT_TRUE(s.ok()) << s.ToString(); - ASSERT_OK(env_->FileExists(file2)); - - // This file have overlapping values with the exisitng data - s = db_->AddFile(&file3_info, true /* move file */); - ASSERT_FALSE(s.ok()) << s.ToString(); - ASSERT_OK(env_->FileExists(file3)); - - for (int k = 0; k < 300; k++) { - ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); - } -} - -TEST_F(DBTest, AddExternalSstFileMultiThreaded) { - std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; - // Bulk load 10 files every file contain 1000 keys - int num_files = 10; - int keys_per_file = 1000; - - // Generate file names - std::vector file_names; - for (int i = 0; i < num_files; i++) { - std::string file_name = "file_" + ToString(i) + ".sst"; - file_names.push_back(sst_files_folder + file_name); - } - - do { - env_->CreateDir(sst_files_folder); - Options options = CurrentOptions(); - const ImmutableCFOptions ioptions(options); - - std::atomic thread_num(0); - std::function write_file_func = [&]() { - int file_idx = thread_num.fetch_add(1); - int range_start = file_idx * keys_per_file; - int range_end = range_start + keys_per_file; - - SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); - - 
ASSERT_OK(sst_file_writer.Open(file_names[file_idx])); - - for (int k = range_start; k < range_end; k++) { - ASSERT_OK(sst_file_writer.Add(Key(k), Key(k))); - } + options.write_buffer_size = 100000; + options.max_write_buffer_number = 4; + options.delayed_write_rate = 20000; + options.compression = kNoCompression; + options.disable_auto_compactions = true; + int kNumKeysPerMemtable = 3; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); - Status s = sst_file_writer.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); - }; - // Write num_files files in parallel - std::vector sst_writer_threads; - for (int i = 0; i < num_files; ++i) { - sst_writer_threads.emplace_back(write_file_func); - } + Reopen(options); + test::SleepingBackgroundTask sleeping_task; + // Block flushes + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + sleeping_task.WaitUntilSleeping(); - for (auto& t : sst_writer_threads) { - t.join(); + // Create 3 L0 files, making score of L0 to be 3. + for (int i = 0; i < 3; i++) { + // Fill one mem table + for (int j = 0; j < kNumKeysPerMemtable; j++) { + Put(Key(j), ""); } + ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + } + // Inserting a new entry would create a new mem table, triggering slow down. + Put(Key(0), ""); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); - fprintf(stderr, "Wrote %d files (%d keys)\n", num_files, - num_files * keys_per_file); - - thread_num.store(0); - std::atomic files_added(0); - std::function load_file_func = [&]() { - // We intentionally add every file twice, and assert that it was added - // only once and the other add failed - int thread_id = thread_num.fetch_add(1); - int file_idx = thread_id / 2; - // sometimes we use copy, sometimes link .. 
the result should be the same - bool move_file = (thread_id % 3 == 0); + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); +} +#endif // ROCKSDB_LITE - Status s = db_->AddFile(file_names[file_idx], move_file); - if (s.ok()) { - files_added++; - } - }; - // Bulk load num_files files in parallel - std::vector add_file_threads; - DestroyAndReopen(options); - for (int i = 0; i < num_files * 2; ++i) { - add_file_threads.emplace_back(load_file_func); +TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { + CompressionType compressions[] = {kZlibCompression, kBZip2Compression, + kLZ4Compression, kLZ4HCCompression, + kXpressCompression}; + for (auto comp : compressions) { + if (!CompressionTypeSupported(comp)) { + // not supported, we should fail the Open() + Options options = CurrentOptions(); + options.compression = comp; + ASSERT_TRUE(!TryReopen(options).ok()); + // Try if CreateColumnFamily also fails + options.compression = kNoCompression; + ASSERT_OK(TryReopen(options)); + ColumnFamilyOptions cf_options(options); + cf_options.compression = comp; + ColumnFamilyHandle* handle; + ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok()); } + } +} - for (auto& t : add_file_threads) { - t.join(); - } - ASSERT_EQ(files_added.load(), num_files); - fprintf(stderr, "Loaded %d files (%d keys)\n", num_files, - num_files * keys_per_file); - - // Overwrite values of keys divisible by 100 - for (int k = 0; k < num_files * keys_per_file; k += 100) { - std::string key = Key(k); - Status s = Put(key, key + "_new"); - ASSERT_TRUE(s.ok()); - } +#ifndef ROCKSDB_LITE +TEST_F(DBTest, RowCache) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + options.row_cache = NewLRUCache(8192); + DestroyAndReopen(options); - for (int i = 0; i < 2; i++) { - // Make sure the values are correct before and after flush/compaction - for (int k = 0; k < num_files * keys_per_file; ++k) { - std::string key = Key(k); - std::string value = (k % 
100 == 0) ? (key + "_new") : key; - ASSERT_EQ(Get(key), value); - } - ASSERT_OK(Flush()); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - } + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); - fprintf(stderr, "Verified %d values\n", num_files * keys_per_file); - } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction | - kSkipFIFOCompaction)); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0); + ASSERT_EQ(Get("foo"), "bar"); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); + ASSERT_EQ(Get("foo"), "bar"); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); } +#endif // ROCKSDB_LITE + +TEST_F(DBTest, DeletingOldWalAfterDrop) { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"}, + {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}}); + rocksdb::SyncPoint::GetInstance()->ClearTrace(); -// 1 Create some SST files by inserting K-V pairs into DB -// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file -// 3 Open DB and check if all key can be read -TEST_F(DBTest, SSTsWithLdbSuffixHandling) { + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); Options options = CurrentOptions(); - options.write_buffer_size = 110 << 10; // 110KB - options.num_levels = 4; + options.max_total_wal_size = 8192; + options.compression = kNoCompression; + options.write_buffer_size = 1 << 20; + options.level0_file_num_compaction_trigger = (1 << 30); + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.disable_auto_compactions = true; DestroyAndReopen(options); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Random rnd(301); - int key_id = 0; - for (int i = 0; i < 10; ++i) { - GenerateNewFile(&rnd, 
&key_id, false); - } - Flush(); - Close(); - int const num_files = GetSstFileCount(dbname_); - ASSERT_GT(num_files, 0); - - std::vector filenames; - GetSstFiles(dbname_, &filenames); - int num_ldb_files = 0; - for (unsigned int i = 0; i < filenames.size(); ++i) { - if (i & 1) { - continue; - } - std::string const rdb_name = dbname_ + "/" + filenames[i]; - std::string const ldb_name = Rocks2LevelTableFileName(rdb_name); - ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok()); - ++num_ldb_files; - } - ASSERT_GT(num_ldb_files, 0); - ASSERT_EQ(num_files, GetSstFileCount(dbname_)); + CreateColumnFamilies({"cf1", "cf2"}, options); + ASSERT_OK(Put(0, "key1", DummyString(8192))); + ASSERT_OK(Put(0, "key2", DummyString(8192))); + // the oldest wal should now be getting_flushed + ASSERT_OK(db_->DropColumnFamily(handles_[0])); + // all flushes should now do nothing because their CF is dropped + TEST_SYNC_POINT("Test:AllowFlushes"); + TEST_SYNC_POINT("Test:WaitForFlush"); + uint64_t lognum1 = dbfull()->TEST_LogfileNumber(); + ASSERT_OK(Put(1, "key3", DummyString(8192))); + ASSERT_OK(Put(1, "key4", DummyString(8192))); + // new wal should have been created + uint64_t lognum2 = dbfull()->TEST_LogfileNumber(); + EXPECT_GT(lognum2, lognum1); +} - Reopen(options); - for (int k = 0; k < key_id; ++k) { - ASSERT_NE("NOT_FOUND", Get(Key(k))); - } - Destroy(options); +TEST_F(DBTest, UnsupportedManualSync) { + DestroyAndReopen(CurrentOptions()); + env_->is_wal_sync_thread_safe_.store(false); + Status s = db_->SyncWAL(); + ASSERT_TRUE(s.IsNotSupported()); } INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam, - ::testing::Values(1, 4)); + ::testing::Combine(::testing::Values(1, 4), + ::testing::Bool())); TEST_F(DBTest, PauseBackgroundWorkTest) { - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000; // Small write buffer - options = CurrentOptions(options); Reopen(options); std::vector threads; @@ -9749,129 +5715,10 @@ TEST_F(DBTest, 
PauseBackgroundWorkTest) { ASSERT_EQ(true, done.load()); } -// 1 Insert 2 K-V pairs into DB -// 2 Call Get() for both keys - expext memtable bloom hit stat to be 2 -// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1 -// 4 Call Flush() to create SST -// 5 Call Get() for both keys - expext SST bloom hit stat to be 2 -// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1 -// Test both: block and plain SST -TEST_P(BloomStatsTestWithParam, BloomStatsTest) { - std::string key1("AAAA"); - std::string key2("RXDB"); // not in DB - std::string key3("ZBRA"); - std::string value1("Value1"); - std::string value3("Value3"); - - ASSERT_OK(Put(key1, value1, WriteOptions())); - ASSERT_OK(Put(key3, value3, WriteOptions())); - - // check memtable bloom stats - ASSERT_EQ(value1, Get(key1)); - ASSERT_EQ(1, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(value3, Get(key3)); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); - - ASSERT_EQ("NOT_FOUND", Get(key2)); - ASSERT_EQ(1, perf_context.bloom_memtable_miss_count); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - - // sanity checks - ASSERT_EQ(0, perf_context.bloom_sst_hit_count); - ASSERT_EQ(0, perf_context.bloom_sst_miss_count); - - Flush(); - - // sanity checks - ASSERT_EQ(0, perf_context.bloom_sst_hit_count); - ASSERT_EQ(0, perf_context.bloom_sst_miss_count); - - // check SST bloom stats - // NOTE: hits per get differs because of code paths differences - // in BlockBasedTable::Get() - int hits_per_get = use_block_table_ && !use_block_based_builder_ ? 
2 : 1; - ASSERT_EQ(value1, Get(key1)); - ASSERT_EQ(hits_per_get, perf_context.bloom_sst_hit_count); - ASSERT_EQ(value3, Get(key3)); - ASSERT_EQ(2 * hits_per_get, perf_context.bloom_sst_hit_count); - - ASSERT_EQ("NOT_FOUND", Get(key2)); - ASSERT_EQ(1, perf_context.bloom_sst_miss_count); -} - -// Same scenario as in BloomStatsTest but using an iterator -TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { - std::string key1("AAAA"); - std::string key2("RXDB"); // not in DB - std::string key3("ZBRA"); - std::string value1("Value1"); - std::string value3("Value3"); - - ASSERT_OK(Put(key1, value1, WriteOptions())); - ASSERT_OK(Put(key3, value3, WriteOptions())); - - unique_ptr iter(dbfull()->NewIterator(ReadOptions())); - - // check memtable bloom stats - iter->Seek(key1); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value1, iter->value().ToString()); - ASSERT_EQ(1, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); - - iter->Seek(key3); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value3, iter->value().ToString()); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - ASSERT_EQ(0, perf_context.bloom_memtable_miss_count); - - iter->Seek(key2); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(1, perf_context.bloom_memtable_miss_count); - ASSERT_EQ(2, perf_context.bloom_memtable_hit_count); - - Flush(); - - iter.reset(dbfull()->NewIterator(ReadOptions())); - - // check SST bloom stats - iter->Seek(key1); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value1, iter->value().ToString()); - ASSERT_EQ(1, perf_context.bloom_sst_hit_count); - - iter->Seek(key3); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(value3, iter->value().ToString()); - ASSERT_EQ(2, perf_context.bloom_sst_hit_count); - - iter->Seek(key2); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(1, 
perf_context.bloom_sst_miss_count); - ASSERT_EQ(2, perf_context.bloom_sst_hit_count); -} - -INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, - ::testing::Values(std::make_tuple(true, true), - std::make_tuple(true, false), - std::make_tuple(false, false))); } // namespace rocksdb -#endif - int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/external/rocksdb/db/db_test2.cc b/external/rocksdb/db/db_test2.cc new file mode 100644 index 0000000000..b4e989389f --- /dev/null +++ b/external/rocksdb/db/db_test2.cc @@ -0,0 +1,1841 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/wal_filter.h" + +namespace rocksdb { + +class DBTest2 : public DBTestBase { + public: + DBTest2() : DBTestBase("/db_test2") {} +}; + +class PrefixFullBloomWithReverseComparator + : public DBTestBase, + public ::testing::WithParamInterface { + public: + PrefixFullBloomWithReverseComparator() + : DBTestBase("/prefix_bloom_reverse") {} + virtual void SetUp() override { if_cache_filter_ = GetParam(); } + bool if_cache_filter_; +}; + +TEST_P(PrefixFullBloomWithReverseComparator, + PrefixFullBloomWithReverseComparator) { + Options options = last_options_; + options.comparator = ReverseBytewiseComparator(); + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + if (if_cache_filter_) { + bbto.no_block_cache = false; + bbto.cache_index_and_filter_blocks = true; + bbto.block_cache = NewLRUCache(1); + } + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3")); + + dbfull()->Flush(FlushOptions()); + + if (bbto.block_cache) { + bbto.block_cache->EraseUnRefEntries(); + } + + unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek("bar345"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar234", iter->key().ToString()); + ASSERT_EQ("foo2", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar123", iter->key().ToString()); + ASSERT_EQ("foo", iter->value().ToString()); + + iter->Seek("foo234"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ("foo123", iter->key().ToString()); + ASSERT_EQ("foo3", iter->value().ToString()); + + iter->Seek("bar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); +} + +INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator, + PrefixFullBloomWithReverseComparator, testing::Bool()); + +TEST_F(DBTest2, IteratorPropertyVersionNumber) { + Put("", ""); + Iterator* iter1 = db_->NewIterator(ReadOptions()); + std::string prop_value; + ASSERT_OK( + iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number1 = + static_cast(std::atoi(prop_value.c_str())); + + Put("", ""); + Flush(); + + Iterator* iter2 = db_->NewIterator(ReadOptions()); + ASSERT_OK( + iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number2 = + static_cast(std::atoi(prop_value.c_str())); + + ASSERT_GT(version_number2, version_number1); + + Put("", ""); + + Iterator* iter3 = db_->NewIterator(ReadOptions()); + ASSERT_OK( + iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number3 = + static_cast(std::atoi(prop_value.c_str())); + + ASSERT_EQ(version_number2, version_number3); + + iter1->SeekToFirst(); + ASSERT_OK( + iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number1_new = + static_cast(std::atoi(prop_value.c_str())); + ASSERT_EQ(version_number1, version_number1_new); + + delete iter1; + delete iter2; + delete iter3; +} + +TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + Put(1, "a", "begin"); + Put(1, "z", 
"end"); + ASSERT_OK(Flush(1)); + TryReopenWithColumnFamilies({"default", "pikachu"}, options); + + std::string value; + value = Get(1, "a"); +} + +#ifndef ROCKSDB_LITE +class DBTestSharedWriteBufferAcrossCFs + : public DBTestBase, + public testing::WithParamInterface { + public: + DBTestSharedWriteBufferAcrossCFs() + : DBTestBase("/db_test_shared_write_buffer") {} + void SetUp() override { use_old_interface_ = GetParam(); } + bool use_old_interface_; +}; + +TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { + Options options = CurrentOptions(); + if (use_old_interface_) { + options.db_write_buffer_size = 100000; // this is the real limit + } else { + options.write_buffer_manager.reset(new WriteBufferManager(100000)); + } + options.write_buffer_size = 500000; // this is never hit + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + // Trigger a flush on CF "nikitich" + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(1), DummyString(90000))); + ASSERT_OK(Put(2, Key(2), DummyString(20000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + + // "dobrynia": 20KB + // Flush 'dobrynia' + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + ASSERT_OK(Put(2, Key(2), DummyString(70000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + 
dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + + // "nikitich" still has data of 80KB + // Inserting Data in "dobrynia" triggers "nikitich" flushing. + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + ASSERT_OK(Put(2, Key(2), DummyString(40000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + // "dobrynia" still has 40KB + ASSERT_OK(Put(1, Key(2), DummyString(20000))); + ASSERT_OK(Put(0, Key(1), DummyString(10000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // This should trigger no flush + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + // "default": 10KB, "pikachu": 20KB, "dobrynia": 40KB + ASSERT_OK(Put(1, Key(2), 
DummyString(40000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // This should trigger a flush of "pikachu" + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + // "default": 10KB, "dobrynia": 40KB + // Some remaining writes so 'default', 'dobrynia' and 'nikitich' flush on + // closure. + ASSERT_OK(Put(3, Key(1), DummyString(1))); + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + } +} + +INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs, + DBTestSharedWriteBufferAcrossCFs, ::testing::Bool()); + +TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { + std::string dbname2 = test::TmpDir(env_) + "/db_shared_wb_db2"; + Options options = CurrentOptions(); + options.write_buffer_size = 500000; // this is never hit + options.write_buffer_manager.reset(new WriteBufferManager(100000)); + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + ASSERT_OK(DB::Open(options, dbname2, &db2)); + + WriteOptions wo; + + // Trigger a flush on cf2 + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, 
Key(1), DummyString(90000))); + + // Insert to DB2 + ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000))); + + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + static_cast(db2)->TEST_WaitForFlushMemTable(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(0)); + } + + // db2: 20KB + ASSERT_OK(db2->Put(wo, Key(2), DummyString(40000))); + ASSERT_OK(db2->Put(wo, Key(3), DummyString(70000))); + ASSERT_OK(db2->Put(wo, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + static_cast(db2)->TEST_WaitForFlushMemTable(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(1)); + } + + // + // Inserting Data in db2 and db_ triggers flushing in db_. 
+ ASSERT_OK(db2->Put(wo, Key(3), DummyString(70000))); + ASSERT_OK(Put(2, Key(2), DummyString(45000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + static_cast(db2)->TEST_WaitForFlushMemTable(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(1)); + } + + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + +namespace { + void ValidateKeyExistence(DB* db, const std::vector& keys_must_exist, + const std::vector& keys_must_not_exist) { + // Ensure that expected keys exist + std::vector values; + if (keys_must_exist.size() > 0) { + std::vector status_list = + db->MultiGet(ReadOptions(), keys_must_exist, &values); + for (size_t i = 0; i < keys_must_exist.size(); i++) { + ASSERT_OK(status_list[i]); + } + } + + // Ensure that given keys don't exist + if (keys_must_not_exist.size() > 0) { + std::vector status_list = + db->MultiGet(ReadOptions(), keys_must_not_exist, &values); + for (size_t i = 0; i < keys_must_not_exist.size(); i++) { + ASSERT_TRUE(status_list[i].IsNotFound()); + } + } + } + +} // namespace + +TEST_F(DBTest2, WalFilterTest) { + class TestWalFilter : public WalFilter { + private: + // Processing option that is requested to be applied at the given index + WalFilter::WalProcessingOption wal_processing_option_; + // Index at which to apply wal_processing_option_ + // At other indexes default wal_processing_option::kContinueProcessing is + // returned. + size_t apply_option_at_record_index_; + // Current record index, incremented with each record encountered. 
+ size_t current_record_index_; + + public: + TestWalFilter(WalFilter::WalProcessingOption wal_processing_option, + size_t apply_option_for_record_index) + : wal_processing_option_(wal_processing_option), + apply_option_at_record_index_(apply_option_for_record_index), + current_record_index_(0) {} + + virtual WalProcessingOption LogRecord(const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) const override { + WalFilter::WalProcessingOption option_to_return; + + if (current_record_index_ == apply_option_at_record_index_) { + option_to_return = wal_processing_option_; + } + else { + option_to_return = WalProcessingOption::kContinueProcessing; + } + + // Filter is passed as a const object for RocksDB to not modify the + // object, however we modify it for our own purpose here and hence + // cast the constness away. + (const_cast(this)->current_record_index_)++; + + return option_to_return; + } + + virtual const char* Name() const override { return "TestWalFilter"; } + }; + + // Create 3 batches with two keys each + std::vector> batch_keys(3); + + batch_keys[0].push_back("key1"); + batch_keys[0].push_back("key2"); + batch_keys[1].push_back("key3"); + batch_keys[1].push_back("key4"); + batch_keys[2].push_back("key5"); + batch_keys[2].push_back("key6"); + + // Test with all WAL processing options + for (int option = 0; + option < static_cast( + WalFilter::WalProcessingOption::kWalProcessingOptionMax); + option++) { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(options); + CreateAndReopenWithCF({ "pikachu" }, options); + + // Write given keys in given batches + for (size_t i = 0; i < batch_keys.size(); i++) { + WriteBatch batch; + for (size_t j = 0; j < batch_keys[i].size(); j++) { + batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + } + dbfull()->Write(WriteOptions(), &batch); + } + + WalFilter::WalProcessingOption wal_processing_option = + static_cast(option); + + // Create a test filter that would apply 
wal_processing_option at the first + // record + size_t apply_option_for_record_index = 1; + TestWalFilter test_wal_filter(wal_processing_option, + apply_option_for_record_index); + + // Reopen database with option to use WAL filter + options = OptionsForLogIterTest(); + options.wal_filter = &test_wal_filter; + Status status = + TryReopenWithColumnFamilies({ "default", "pikachu" }, options); + if (wal_processing_option == + WalFilter::WalProcessingOption::kCorruptedRecord) { + assert(!status.ok()); + // In case of corruption we can turn off paranoid_checks to reopen + // database + options.paranoid_checks = false; + ReopenWithColumnFamilies({ "default", "pikachu" }, options); + } + else { + assert(status.ok()); + } + + // Compute which keys we expect to be found + // and which we expect not to be found after recovery. + std::vector keys_must_exist; + std::vector keys_must_not_exist; + switch (wal_processing_option) { + case WalFilter::WalProcessingOption::kCorruptedRecord: + case WalFilter::WalProcessingOption::kContinueProcessing: { + fprintf(stderr, "Testing with complete WAL processing\n"); + // we expect all records to be processed + for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { + keys_must_exist.push_back(Slice(batch_keys[i][j])); + } + } + break; + } + case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: { + fprintf(stderr, + "Testing with ignoring record %" ROCKSDB_PRIszt " only\n", + apply_option_for_record_index); + // We expect the record with apply_option_for_record_index to be not + // found. 
+ for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { + if (i == apply_option_for_record_index) { + keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + } + else { + keys_must_exist.push_back(Slice(batch_keys[i][j])); + } + } + } + break; + } + case WalFilter::WalProcessingOption::kStopReplay: { + fprintf(stderr, + "Testing with stopping replay from record %" ROCKSDB_PRIszt + "\n", + apply_option_for_record_index); + // We expect records beyond apply_option_for_record_index to be not + // found. + for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { + if (i >= apply_option_for_record_index) { + keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + } + else { + keys_must_exist.push_back(Slice(batch_keys[i][j])); + } + } + } + break; + } + default: + assert(false); // unhandled case + } + + bool checked_after_reopen = false; + + while (true) { + // Ensure that expected keys exist + // and unexpected keys don't exist after recovery + ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); + + if (checked_after_reopen) { + break; + } + + // reopen database again to make sure previous log(s) are not used + //(even if they were skipped) + // reopen database with option to use WAL filter + options = OptionsForLogIterTest(); + ReopenWithColumnFamilies({ "default", "pikachu" }, options); + + checked_after_reopen = true; + } + } +} + +TEST_F(DBTest2, WalFilterTestWithChangeBatch) { + class ChangeBatchHandler : public WriteBatch::Handler { + private: + // Batch to insert keys in + WriteBatch* new_write_batch_; + // Number of keys to add in the new batch + size_t num_keys_to_add_in_new_batch_; + // Number of keys added to new batch + size_t num_keys_added_; + + public: + ChangeBatchHandler(WriteBatch* new_write_batch, + size_t num_keys_to_add_in_new_batch) + : new_write_batch_(new_write_batch), + num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), + 
num_keys_added_(0) {} + virtual void Put(const Slice& key, const Slice& value) override { + if (num_keys_added_ < num_keys_to_add_in_new_batch_) { + new_write_batch_->Put(key, value); + ++num_keys_added_; + } + } + }; + + class TestWalFilterWithChangeBatch : public WalFilter { + private: + // Index at which to start changing records + size_t change_records_from_index_; + // Number of keys to add in the new batch + size_t num_keys_to_add_in_new_batch_; + // Current record index, incremented with each record encountered. + size_t current_record_index_; + + public: + TestWalFilterWithChangeBatch(size_t change_records_from_index, + size_t num_keys_to_add_in_new_batch) + : change_records_from_index_(change_records_from_index), + num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), + current_record_index_(0) {} + + virtual WalProcessingOption LogRecord(const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) const override { + if (current_record_index_ >= change_records_from_index_) { + ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_); + batch.Iterate(&handler); + *batch_changed = true; + } + + // Filter is passed as a const object for RocksDB to not modify the + // object, however we modify it for our own purpose here and hence + // cast the constness away. 
+ (const_cast(this) + ->current_record_index_)++; + + return WalProcessingOption::kContinueProcessing; + } + + virtual const char* Name() const override { + return "TestWalFilterWithChangeBatch"; + } + }; + + std::vector> batch_keys(3); + + batch_keys[0].push_back("key1"); + batch_keys[0].push_back("key2"); + batch_keys[1].push_back("key3"); + batch_keys[1].push_back("key4"); + batch_keys[2].push_back("key5"); + batch_keys[2].push_back("key6"); + + Options options = OptionsForLogIterTest(); + DestroyAndReopen(options); + CreateAndReopenWithCF({ "pikachu" }, options); + + // Write given keys in given batches + for (size_t i = 0; i < batch_keys.size(); i++) { + WriteBatch batch; + for (size_t j = 0; j < batch_keys[i].size(); j++) { + batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + } + dbfull()->Write(WriteOptions(), &batch); + } + + // Create a test filter that would apply wal_processing_option at the first + // record + size_t change_records_from_index = 1; + size_t num_keys_to_add_in_new_batch = 1; + TestWalFilterWithChangeBatch test_wal_filter_with_change_batch( + change_records_from_index, num_keys_to_add_in_new_batch); + + // Reopen database with option to use WAL filter + options = OptionsForLogIterTest(); + options.wal_filter = &test_wal_filter_with_change_batch; + ReopenWithColumnFamilies({ "default", "pikachu" }, options); + + // Ensure that all keys exist before change_records_from_index_ + // And after that index only single key exists + // as our filter adds only single key for each batch + std::vector keys_must_exist; + std::vector keys_must_not_exist; + + for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { + if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) { + keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + } + else { + keys_must_exist.push_back(Slice(batch_keys[i][j])); + } + } + } + + bool checked_after_reopen = false; + + while (true) { + // Ensure 
that expected keys exists + // and not expected keys don't exist after recovery + ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); + + if (checked_after_reopen) { + break; + } + + // reopen database again to make sure previous log(s) are not used + //(even if they were skipped) + // reopn database with option to use WAL filter + options = OptionsForLogIterTest(); + ReopenWithColumnFamilies({ "default", "pikachu" }, options); + + checked_after_reopen = true; + } +} + +TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { + class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter { + public: + virtual WalProcessingOption LogRecord(const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) const override { + *new_batch = batch; + new_batch->Put("key_extra", "value_extra"); + *batch_changed = true; + return WalProcessingOption::kContinueProcessing; + } + + virtual const char* Name() const override { + return "WalFilterTestWithChangeBatchExtraKeys"; + } + }; + + std::vector> batch_keys(3); + + batch_keys[0].push_back("key1"); + batch_keys[0].push_back("key2"); + batch_keys[1].push_back("key3"); + batch_keys[1].push_back("key4"); + batch_keys[2].push_back("key5"); + batch_keys[2].push_back("key6"); + + Options options = OptionsForLogIterTest(); + DestroyAndReopen(options); + CreateAndReopenWithCF({ "pikachu" }, options); + + // Write given keys in given batches + for (size_t i = 0; i < batch_keys.size(); i++) { + WriteBatch batch; + for (size_t j = 0; j < batch_keys[i].size(); j++) { + batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + } + dbfull()->Write(WriteOptions(), &batch); + } + + // Create a test filter that would add extra keys + TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys; + + // Reopen database with option to use WAL filter + options = OptionsForLogIterTest(); + options.wal_filter = &test_wal_filter_extra_keys; + Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, 
options); + ASSERT_TRUE(status.IsNotSupported()); + + // Reopen without filter, now reopen should succeed - previous + // attempt to open must not have altered the db. + options = OptionsForLogIterTest(); + ReopenWithColumnFamilies({ "default", "pikachu" }, options); + + std::vector keys_must_exist; + std::vector keys_must_not_exist; // empty vector + + for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { + keys_must_exist.push_back(Slice(batch_keys[i][j])); + } + } + + ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); +} + +TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { + class TestWalFilterWithColumnFamilies : public WalFilter { + private: + // column_family_id -> log_number map (provided to WALFilter) + std::map cf_log_number_map_; + // column_family_name -> column_family_id map (provided to WALFilter) + std::map cf_name_id_map_; + // column_family_name -> keys_found_in_wal map + // We store keys that are applicable to the column_family + // during recovery (i.e. aren't already flushed to SST file(s)) + // for verification against the keys we expect. 
+ std::map> cf_wal_keys_; + public: + virtual void ColumnFamilyLogNumberMap( + const std::map& cf_lognumber_map, + const std::map& cf_name_id_map) override { + cf_log_number_map_ = cf_lognumber_map; + cf_name_id_map_ = cf_name_id_map; + } + + virtual WalProcessingOption LogRecordFound(unsigned long long log_number, + const std::string& log_file_name, + const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) override { + class LogRecordBatchHandler : public WriteBatch::Handler { + private: + const std::map & cf_log_number_map_; + std::map> & cf_wal_keys_; + unsigned long long log_number_; + public: + LogRecordBatchHandler(unsigned long long current_log_number, + const std::map & cf_log_number_map, + std::map> & cf_wal_keys) : + cf_log_number_map_(cf_log_number_map), + cf_wal_keys_(cf_wal_keys), + log_number_(current_log_number){} + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& /*value*/) override { + auto it = cf_log_number_map_.find(column_family_id); + assert(it != cf_log_number_map_.end()); + unsigned long long log_number_for_cf = it->second; + // If the current record is applicable for column_family_id + // (i.e. isn't flushed to SST file(s) for column_family_id) + // add it to the cf_wal_keys_ map for verification. 
+ if (log_number_ >= log_number_for_cf) { + cf_wal_keys_[column_family_id].push_back(std::string(key.data(), + key.size())); + } + return Status::OK(); + } + } handler(log_number, cf_log_number_map_, cf_wal_keys_); + + batch.Iterate(&handler); + + return WalProcessingOption::kContinueProcessing; + } + + virtual const char* Name() const override { + return "WalFilterTestWithColumnFamilies"; + } + + const std::map>& GetColumnFamilyKeys() { + return cf_wal_keys_; + } + + const std::map & GetColumnFamilyNameIdMap() { + return cf_name_id_map_; + } + }; + + std::vector> batch_keys_pre_flush(3); + + batch_keys_pre_flush[0].push_back("key1"); + batch_keys_pre_flush[0].push_back("key2"); + batch_keys_pre_flush[1].push_back("key3"); + batch_keys_pre_flush[1].push_back("key4"); + batch_keys_pre_flush[2].push_back("key5"); + batch_keys_pre_flush[2].push_back("key6"); + + Options options = OptionsForLogIterTest(); + DestroyAndReopen(options); + CreateAndReopenWithCF({ "pikachu" }, options); + + // Write given keys in given batches + for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { + WriteBatch batch; + for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { + batch.Put(handles_[0], batch_keys_pre_flush[i][j], DummyString(1024)); + batch.Put(handles_[1], batch_keys_pre_flush[i][j], DummyString(1024)); + } + dbfull()->Write(WriteOptions(), &batch); + } + + //Flush default column-family + db_->Flush(FlushOptions(), handles_[0]); + + // Do some more writes + std::vector> batch_keys_post_flush(3); + + batch_keys_post_flush[0].push_back("key7"); + batch_keys_post_flush[0].push_back("key8"); + batch_keys_post_flush[1].push_back("key9"); + batch_keys_post_flush[1].push_back("key10"); + batch_keys_post_flush[2].push_back("key11"); + batch_keys_post_flush[2].push_back("key12"); + + // Write given keys in given batches + for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { + WriteBatch batch; + for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { + 
batch.Put(handles_[0], batch_keys_post_flush[i][j], DummyString(1024)); + batch.Put(handles_[1], batch_keys_post_flush[i][j], DummyString(1024)); + } + dbfull()->Write(WriteOptions(), &batch); + } + + // On Recovery we should only find the second batch applicable to default CF + // But both batches applicable to pikachu CF + + // Create a test filter that would add extra keys + TestWalFilterWithColumnFamilies test_wal_filter_column_families; + + // Reopen database with option to use WAL filter + options = OptionsForLogIterTest(); + options.wal_filter = &test_wal_filter_column_families; + Status status = + TryReopenWithColumnFamilies({ "default", "pikachu" }, options); + ASSERT_TRUE(status.ok()); + + // verify that handles_[0] only has post_flush keys + // while handles_[1] has pre and post flush keys + auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys(); + auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap(); + size_t index = 0; + auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]]; + //default column-family, only post_flush keys are expected + for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { + for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { + Slice key_from_the_log(keys_cf[index++]); + Slice batch_key(batch_keys_post_flush[i][j]); + ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + } + } + ASSERT_TRUE(index == keys_cf.size()); + + index = 0; + keys_cf = cf_wal_keys[name_id_map["pikachu"]]; + //pikachu column-family, all keys are expected + for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { + for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { + Slice key_from_the_log(keys_cf[index++]); + Slice batch_key(batch_keys_pre_flush[i][j]); + ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + } + } + + for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { + for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { + Slice 
key_from_the_log(keys_cf[index++]); + Slice batch_key(batch_keys_post_flush[i][j]); + ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + } + } + ASSERT_TRUE(index == keys_cf.size()); +} + +TEST_F(DBTest2, PresetCompressionDict) { + const size_t kBlockSizeBytes = 4 << 10; + const size_t kL0FileBytes = 128 << 10; + const size_t kApproxPerBlockOverheadBytes = 50; + const int kNumL0Files = 5; + + Options options; + options.arena_block_size = kBlockSizeBytes; + options.compaction_style = kCompactionStyleUniversal; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.memtable_factory.reset( + new SpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes)); + options.num_levels = 2; + options.target_file_size_base = kL0FileBytes; + options.target_file_size_multiplier = 2; + options.write_buffer_size = kL0FileBytes; + BlockBasedTableOptions table_options; + table_options.block_size = kBlockSizeBytes; + std::vector compression_types; + if (Zlib_Supported()) { + compression_types.push_back(kZlibCompression); + } +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + compression_types.push_back(kLZ4Compression); + compression_types.push_back(kLZ4HCCompression); +#endif // LZ4_VERSION_NUMBER >= 10400 +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + compression_types.push_back(kZSTDNotFinalCompression); +#endif // ZSTD_VERSION_NUMBER >= 500 + + for (auto compression_type : compression_types) { + options.compression = compression_type; + size_t prev_out_bytes; + for (int i = 0; i < 2; ++i) { + // First iteration: compress without preset dictionary + // Second iteration: compress with preset dictionary + // To make sure the compression dictionary was actually used, we verify + // the compressed size is smaller in the second iteration. Also in the + // second iteration, verify the data we get out is the same data we put + // in. 
+ if (i) { + options.compression_opts.max_dict_bytes = kBlockSizeBytes; + } else { + options.compression_opts.max_dict_bytes = 0; + } + + options.statistics = rocksdb::CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + std::string seq_data = + RandomString(&rnd, kBlockSizeBytes - kApproxPerBlockOverheadBytes); + + ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); + for (int j = 0; j < kNumL0Files; ++j) { + for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) { + ASSERT_OK(Put(1, Key(static_cast( + j * (kL0FileBytes / kBlockSizeBytes) + k)), + seq_data)); + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1)); + } + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + + size_t out_bytes = 0; + std::vector files; + GetSstFiles(dbname_, &files); + for (const auto& file : files) { + uint64_t curr_bytes; + env_->GetFileSize(dbname_ + "/" + file, &curr_bytes); + out_bytes += static_cast(curr_bytes); + } + + for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes); + j++) { + ASSERT_EQ(seq_data, Get(1, Key(static_cast(j)))); + } + if (i) { + ASSERT_GT(prev_out_bytes, out_bytes); + } + prev_out_bytes = out_bytes; + DestroyAndReopen(options); + } + } +} + +class CompactionCompressionListener : public EventListener { + public: + explicit CompactionCompressionListener(Options* db_options) + : db_options_(db_options) {} + + void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { + // Figure out last level with files + int bottommost_level = 0; + for (int level = 0; level < db->NumberLevels(); level++) { + std::string files_at_level; + ASSERT_TRUE( + db->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &files_at_level)); + if (files_at_level != "0") { + 
bottommost_level = level; + } + } + + if (db_options_->bottommost_compression != kDisableCompressionOption && + ci.output_level == bottommost_level && ci.output_level >= 2) { + ASSERT_EQ(ci.compression, db_options_->bottommost_compression); + } else if (db_options_->compression_per_level.size() != 0) { + ASSERT_EQ(ci.compression, + db_options_->compression_per_level[ci.output_level]); + } else { + ASSERT_EQ(ci.compression, db_options_->compression); + } + max_level_checked = std::max(max_level_checked, ci.output_level); + } + + int max_level_checked = 0; + const Options* db_options_; +}; + +TEST_F(DBTest2, CompressionOptions) { + if (!Zlib_Supported() || !Snappy_Supported()) { + return; + } + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.max_bytes_for_level_base = 100; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 7; + options.max_background_compactions = 1; + options.base_background_compactions = 1; + + CompactionCompressionListener* listener = + new CompactionCompressionListener(&options); + options.listeners.emplace_back(listener); + + const int kKeySize = 5; + const int kValSize = 20; + Random rnd(301); + + for (int iter = 0; iter <= 2; iter++) { + listener->max_level_checked = 0; + + if (iter == 0) { + // Use different compression algorithms for different levels but + // always use Zlib for bottommost level + options.compression_per_level = {kNoCompression, kNoCompression, + kNoCompression, kSnappyCompression, + kSnappyCompression, kSnappyCompression, + kZlibCompression}; + options.compression = kNoCompression; + options.bottommost_compression = kZlibCompression; + } else if (iter == 1) { + // Use Snappy except for bottommost level use ZLib + options.compression_per_level = {}; + options.compression = kSnappyCompression; + options.bottommost_compression = kZlibCompression; + } else if (iter == 2) { + // Use Snappy everywhere + options.compression_per_level = {}; + options.compression = 
kSnappyCompression; + options.bottommost_compression = kDisableCompressionOption; + } + + DestroyAndReopen(options); + // Write 10 random files + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 5; j++) { + ASSERT_OK( + Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValSize))); + } + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForCompact(); + } + + // Make sure that we wrote enough to check all 7 levels + ASSERT_EQ(listener->max_level_checked, 6); + } +} + +class CompactionStallTestListener : public EventListener { + public: + CompactionStallTestListener() : compacted_files_cnt_(0) {} + + void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.cf_name, "default"); + ASSERT_EQ(ci.base_input_level, 0); + ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum); + compacted_files_cnt_ += ci.input_files.size(); + } + std::atomic compacted_files_cnt_; +}; + +TEST_F(DBTest2, CompactionStall) { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"}, + {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"}, + {"DBTest2::CompactionStall:2", + "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.max_background_compactions = 40; + CompactionStallTestListener* listener = new CompactionStallTestListener(); + options.listeners.emplace_back(listener); + DestroyAndReopen(options); + + Random rnd(301); + + // 4 Files in L0 + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + } + ASSERT_OK(Flush()); + } + + // Wait for compaction to be triggered + TEST_SYNC_POINT("DBTest2::CompactionStall:0"); + + // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again + // at DBTest2::CompactionStall::1 + 
rocksdb::SyncPoint::GetInstance()->ClearTrace(); + + // Another 6 L0 files to trigger compaction again + for (int i = 0; i < 6; i++) { + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + } + ASSERT_OK(Flush()); + } + + // Wait for another compaction to be triggered + TEST_SYNC_POINT("DBTest2::CompactionStall:1"); + + // Hold NotifyOnCompactionCompleted in the unlock mutex section + TEST_SYNC_POINT("DBTest2::CompactionStall:2"); + + dbfull()->TEST_WaitForCompact(); + ASSERT_LT(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); + ASSERT_GT(listener->compacted_files_cnt_.load(), + 10 - options.level0_file_num_compaction_trigger); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif // ROCKSDB_LITE + +TEST_F(DBTest2, FirstSnapshotTest) { + Options options; + options.write_buffer_size = 100000; // Small write buffer + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // This snapshot will have sequence number 0 what is expected behaviour. 
+ const Snapshot* s1 = db_->GetSnapshot(); + + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger flush + + db_->ReleaseSnapshot(s1); +} + +class PinL0IndexAndFilterBlocksTest : public DBTestBase, + public testing::WithParamInterface { + public: + PinL0IndexAndFilterBlocksTest() : DBTestBase("/db_pin_l0_index_bloom_test") {} + virtual void SetUp() override { infinite_max_files_ = GetParam(); } + + void CreateTwoLevels(Options* options) { + if (infinite_max_files_) { + options->max_open_files = -1; + } + options->create_if_missing = true; + options->statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options->table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, *options); + + Put(1, "a", "begin"); + Put(1, "z", "end"); + ASSERT_OK(Flush(1)); + // move this table to L1 + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + + // reset block cache + table_options.block_cache = NewLRUCache(64 * 1024); + options->table_factory.reset(NewBlockBasedTableFactory(table_options)); + TryReopenWithColumnFamilies({"default", "pikachu"}, *options); + // create new table at L0 + Put(1, "a2", "begin2"); + Put(1, "z2", "end2"); + ASSERT_OK(Flush(1)); + + table_options.block_cache->EraseUnRefEntries(); + } + + bool infinite_max_files_; +}; + +TEST_P(PinL0IndexAndFilterBlocksTest, + IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) { + Options options = CurrentOptions(); + if (infinite_max_files_) { + options.max_open_files = -1; + } + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + 
table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); + + // index/filter blocks added to block cache right after table creation. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + // only index/filter were added + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + + std::string value; + // Miss and hit count should remain the same, they're all pinned. + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + // Miss and hit count should remain the same, they're all pinned. 
+ value = Get(1, "key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); +} + +TEST_P(PinL0IndexAndFilterBlocksTest, + MultiLevelIndexAndFilterBlocksCachedWithPinning) { + Options options = CurrentOptions(); + PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options); + // get base cache values + uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); + + std::string value; + // this should be read from L0 + // so cache values don't change + value = Get(1, "a2"); + ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + // this should be read from L1 + // the file is opened, prefetching results in a cache filter miss + // the block is loaded and added to the cache, + // then the get results in a cache hit for L1 + // When we have inifinite max_files, there is still cache miss because we have + // reset the block cache + value = Get(1, "a"); + ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); +} + +TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { + Options options = CurrentOptions(); + PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options); + + // Get base cache values + uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t fh = TestGetTickerCount(options, 
BLOCK_CACHE_FILTER_HIT); + uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); + + // Reopen database. If max_open_files is set as -1, table readers will be + // preloaded. This will trigger a BlockBasedTable::Open() and prefetch + // L0 index and filter. Level 1's prefetching is disabled in DB::Open() + TryReopenWithColumnFamilies({"default", "pikachu"}, options); + + if (infinite_max_files_) { + // After reopen, cache miss are increased by one because we read (and only + // read) filter and index on L0 + ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + // If max_open_files is not -1, we do not preload table readers, so there is + // no change. + ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } + std::string value; + // this should be read from L0 + value = Get(1, "a2"); + // If max_open_files is -1, we have pinned index and filter in Rep, so there + // will not be changes in index and filter misses or hits. If max_open_files + // is not -1, Get() will open a TableReader and prefetch index and filter. 
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + // this should be read from L1 + value = Get(1, "a"); + if (infinite_max_files_) { + // In inifinite max files case, there's a cache miss in executing Get() + // because index and filter are not prefetched before. + ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + // In this case, cache miss will be increased by one in + // BlockBasedTable::Open() because this is not in DB::Open() code path so we + // will prefetch L1's index and filter. Cache hit will also be increased by + // one because Get() will read index and filter from the block cache + // prefetched in previous Open() call. 
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } +} + +INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest, + PinL0IndexAndFilterBlocksTest, ::testing::Bool()); +#ifndef ROCKSDB_LITE +static void UniqueIdCallback(void* arg) { + int* result = reinterpret_cast(arg); + if (*result == -1) { + *result = 0; + } + + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); +} + +class MockPersistentCache : public PersistentCache { + public: + explicit MockPersistentCache(const bool is_compressed, const size_t max_size) + : is_compressed_(is_compressed), max_size_(max_size) { + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); + } + + virtual ~MockPersistentCache() {} + + Status Insert(const Slice& page_key, const char* data, + const size_t size) override { + MutexLock _(&lock_); + + if (size_ > max_size_) { + size_ -= data_.begin()->second.size(); + data_.erase(data_.begin()); + } + + data_.insert(std::make_pair(page_key.ToString(), std::string(data, size))); + size_ += size; + return Status::OK(); + } + + Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) override { + MutexLock _(&lock_); + auto it = data_.find(page_key.ToString()); + if (it == data_.end()) { + return Status::NotFound(); + } + + assert(page_key.ToString() == it->first); + data->reset(new char[it->second.size()]); + memcpy(data->get(), it->second.c_str(), it->second.size()); + *size = it->second.size(); + return Status::OK(); + } + + bool IsCompressed() override { return is_compressed_; } + + port::Mutex lock_; + 
std::map data_; + const bool is_compressed_ = true; + size_t size_ = 0; + const size_t max_size_ = 10 * 1024; // 10KiB +}; + +TEST_F(DBTest2, PersistentCache) { + int num_iter = 80; + + Options options; + options.write_buffer_size = 64 * 1024; // small write buffer + options.statistics = rocksdb::CreateDBStatistics(); + options = CurrentOptions(options); + + auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024}; + auto types = {/*compressed*/ 1, /*uncompressed*/ 0}; + for (auto bsize : bsizes) { + for (auto type : types) { + BlockBasedTableOptions table_options; + table_options.persistent_cache.reset( + new MockPersistentCache(type, 10 * 1024)); + table_options.no_block_cache = true; + table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr; + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + no_block_cache_opts.table_factory.reset( + NewBlockBasedTableFactory(table_options_no_bc)); + ReopenWithColumnFamilies( + {"default", "pikachu"}, + std::vector({no_block_cache_opts, options})); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = RandomString(&rnd, 1000); + } + values.push_back(str); + ASSERT_OK(Put(1, Key(i), values[i])); + } + + // flush all data from memtable so that reads are from block cache + ASSERT_OK(Flush(1)); + + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } + + auto hit = 
options.statistics->getTickerCount(PERSISTENT_CACHE_HIT); + auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS); + + ASSERT_GT(hit, 0); + ASSERT_GT(miss, 0); + } + } +} + +namespace { +void CountSyncPoint() { + TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */); +} +} // namespace + +TEST_F(DBTest2, SyncPointMarker) { + std::atomic sync_point_called(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBTest2::MarkedPoint", + [&](void* arg) { sync_point_called.fetch_add(1); }); + + // The first dependency enforces Marker can be loaded before MarkedPoint. + // The second checks that thread 1's MarkedPoint should be disabled here. + // Execution order: + // | Thread 1 | Thread 2 | + // | | Marker | + // | MarkedPoint | | + // | Thread1First | | + // | | MarkedPoint | + rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( + {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}}, + {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}}); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::function func1 = [&]() { + CountSyncPoint(); + TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First"); + }; + + std::function func2 = [&]() { + TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker"); + CountSyncPoint(); + }; + + auto thread1 = std::thread(func1); + auto thread2 = std::thread(func2); + thread1.join(); + thread2.join(); + + // Callback is only executed once + ASSERT_EQ(sync_point_called.load(), 1); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif + +class MergeOperatorPinningTest : public DBTest2, + public testing::WithParamInterface { + public: + MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest, + ::testing::Bool()); + +#ifndef ROCKSDB_LITE +TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { + Options options = CurrentOptions(); + 
BlockBasedTableOptions table_options; + table_options.block_size = 1; // every block will contain one entry + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + const int kKeysPerFile = 10; + const int kOperandsPerKeyPerFile = 7; + const int kOperandSize = 100; + // Filse to write in L0 before compacting to lower level + const int kFilesPerLevel = 3; + + Random rnd(301); + std::map true_data; + int batch_num = 1; + int lvl_to_fill = 4; + int key_id = 0; + while (true) { + for (int j = 0; j < kKeysPerFile; j++) { + std::string key = Key(key_id % 35); + key_id++; + for (int k = 0; k < kOperandsPerKeyPerFile; k++) { + std::string val = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, val)); + if (true_data[key].size() == 0) { + true_data[key] = val; + } else { + true_data[key] += "," + val; + } + } + } + + if (lvl_to_fill == -1) { + // Keep last batch in memtable and stop + break; + } + + ASSERT_OK(Flush()); + if (batch_num % kFilesPerLevel == 0) { + if (lvl_to_fill != 0) { + MoveFilesToLevel(lvl_to_fill); + } + lvl_to_fill--; + } + batch_num++; + } + + // 3 L0 files + // 1 L1 file + // 3 L2 files + // 1 L3 file + // 3 L4 Files + ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3"); + + // Verify Get() + for (auto kv : true_data) { + ASSERT_EQ(Get(kv.first), kv.second); + } + + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Verify Iterator::Next() + auto data_iter = true_data.begin(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) { + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + } + ASSERT_EQ(data_iter, true_data.end()); + + // 
Verify Iterator::Prev() + auto data_rev = true_data.rbegin(); + for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) { + ASSERT_EQ(iter->key().ToString(), data_rev->first); + ASSERT_EQ(iter->value().ToString(), data_rev->second); + } + ASSERT_EQ(data_rev, true_data.rend()); + + // Verify Iterator::Seek() + for (auto kv : true_data) { + iter->Seek(kv.first); + ASSERT_EQ(kv.first, iter->key().ToString()); + ASSERT_EQ(kv.second, iter->value().ToString()); + } + + delete iter; +} + +TEST_P(MergeOperatorPinningTest, Randomized) { + do { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Random rnd(301); + std::map true_data; + + const int kTotalMerges = 10000; + // Every key gets ~10 operands + const int kKeyRange = kTotalMerges / 10; + const int kOperandSize = 20; + const int kNumPutBefore = kKeyRange / 10; // 10% value + const int kNumPutAfter = kKeyRange / 10; // 10% overwrite + const int kNumDelete = kKeyRange / 10; // 10% delete + + // kNumPutBefore keys will have base values + for (int i = 0; i < kNumPutBefore; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Do kTotalMerges merges + for (int i = 0; i < kTotalMerges; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + + if (true_data[key] < value) { + true_data[key] = value; + } + } + + // Overwrite random kNumPutAfter keys + for (int i = 0; i < kNumPutAfter; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + 
ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Delete random kNumDelete keys + for (int i = 0; i < kNumDelete; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + ASSERT_OK(db_->Delete(WriteOptions(), key)); + + true_data.erase(key); + } + + VerifyDBFromMap(true_data); + + // Skip HashCuckoo since it does not support merge operators + } while (ChangeOptions(kSkipMergePut | kSkipHashCuckoo)); +} + +class MergeOperatorHook : public MergeOperator { + public: + explicit MergeOperatorHook(std::shared_ptr _merge_op) + : merge_op_(_merge_op) {} + + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + before_merge_(); + bool res = merge_op_->FullMergeV2(merge_in, merge_out); + after_merge_(); + return res; + } + + virtual const char* Name() const override { return merge_op_->Name(); } + + std::shared_ptr merge_op_; + std::function before_merge_ = []() {}; + std::function after_merge_ = []() {}; +}; + +TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) { + Options options = CurrentOptions(); + + auto merge_hook = + std::make_shared(MergeOperators::CreateMaxOperator()); + options.merge_operator = merge_hook; + options.disable_auto_compactions = true; + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.max_open_files = 20; + BlockBasedTableOptions bbto; + bbto.no_block_cache = disable_block_cache_; + if (bbto.no_block_cache == false) { + bbto.block_cache = NewLRUCache(64 * 1024 * 1024); + } else { + bbto.block_cache = nullptr; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const int kNumOperands = 30; + const int kNumKeys = 1000; + const int kOperandSize = 100; + Random rnd(301); + + // 1000 keys every key have 30 operands, every operand is in a different file + std::map true_data; + for (int i = 0; i < kNumOperands; i++) { + for (int j = 0; j < 
kNumKeys; j++) { + std::string k = Key(j); + std::string v = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), k, v)); + + true_data[k] = std::max(true_data[k], v); + } + ASSERT_OK(Flush()); + } + + std::vector file_numbers = ListTableFiles(env_, dbname_); + ASSERT_EQ(file_numbers.size(), kNumOperands); + int merge_cnt = 0; + + // Code executed before merge operation + merge_hook->before_merge_ = [&]() { + // Evict all tables from cache before every merge operation + for (uint64_t num : file_numbers) { + TableCache::Evict(dbfull()->TEST_table_cache(), num); + } + // Decrease cache capacity to force all unrefed blocks to be evicted + if (bbto.block_cache) { + bbto.block_cache->SetCapacity(1); + } + merge_cnt++; + }; + + // Code executed after merge operation + merge_hook->after_merge_ = [&]() { + // Increase capacity again after doing the merge + if (bbto.block_cache) { + bbto.block_cache->SetCapacity(64 * 1024 * 1024); + } + }; + + VerifyDBFromMap(true_data); + ASSERT_EQ(merge_cnt, kNumKeys * 4 /* get + next + prev + seek */); + + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + VerifyDBFromMap(true_data); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest2, MaxSuccessiveMergesInRecovery) { + Options options; + options = CurrentOptions(options); + options.merge_operator = MergeOperators::CreatePutOperator(); + DestroyAndReopen(options); + + db_->Put(WriteOptions(), "foo", "bar"); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar")); + + options.max_successive_merges = 3; + Reopen(options); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/util/db_test_util.cc 
b/external/rocksdb/db/db_test_util.cc similarity index 72% rename from external/rocksdb/util/db_test_util.cc rename to external/rocksdb/db/db_test_util.cc index 6b494eaece..7ab9b3d109 100644 --- a/external/rocksdb/util/db_test_util.cc +++ b/external/rocksdb/db/db_test_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/db_test_util.h" +#include "db/db_test_util.h" namespace rocksdb { @@ -18,6 +18,7 @@ SpecialEnv::SpecialEnv(Env* base) rnd_(301), sleep_counter_(this), addon_time_(0), + time_elapse_only_sleep_(false), no_sleep_(false) { delay_sstable_sync_.store(false, std::memory_order_release); drop_writes_.store(false, std::memory_order_release); @@ -29,6 +30,8 @@ SpecialEnv::SpecialEnv(Env* base) manifest_write_error_.store(false, std::memory_order_release); log_write_error_.store(false, std::memory_order_release); random_file_open_counter_.store(0, std::memory_order_relaxed); + delete_count_.store(0, std::memory_order_relaxed); + num_open_wal_file_.store(0); log_write_slowdown_ = 0; bytes_written_ = 0; sync_counter_ = 0; @@ -38,11 +41,10 @@ SpecialEnv::SpecialEnv(Env* base) table_write_callback_ = nullptr; } - -DBTestBase::DBTestBase(const std::string path) : option_config_(kDefault), - mem_env_(!getenv("MEM_ENV") ? nullptr : - new MockEnv(Env::Default())), - env_(new SpecialEnv(mem_env_ ? mem_env_ : Env::Default())) { +DBTestBase::DBTestBase(const std::string path) + : option_config_(kDefault), + mem_env_(!getenv("MEM_ENV") ? 
nullptr : new MockEnv(Env::Default())), + env_(new SpecialEnv(mem_env_ ? mem_env_ : Env::Default())) { env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); dbname_ = test::TmpDir(env_) + path; @@ -56,15 +58,13 @@ DBTestBase::DBTestBase(const std::string path) : option_config_(kDefault), EXPECT_OK(DestroyDB(dbname_, options)); db_ = nullptr; Reopen(options); + Random::GetTLSInstance()->Reset(0xdeadbeef); } DBTestBase::~DBTestBase() { -// SyncPoint is not supported in Released Windows Mode. -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->LoadDependency({}); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#endif // !(defined NDEBUG) || !defined(OS_WIN) Close(); Options options; options.db_paths.emplace_back(dbname_, 0); @@ -75,48 +75,65 @@ DBTestBase::~DBTestBase() { delete env_; } -// Switch to a fresh database with the next option configuration to -// test. Return false if there are no more configurations to test. 
-bool DBTestBase::ChangeOptions(int skip_mask) { - for (option_config_++; option_config_ < kEnd; option_config_++) { - if ((skip_mask & kSkipDeletesFilterFirst) && - option_config_ == kDeletesFilterFirst) { - continue; +bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { +#ifdef ROCKSDB_LITE + // These options are not supported in ROCKSDB_LITE + if (option_config == kHashSkipList || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap || + option_config == kPlainTableAllBytesPrefix || + option_config == kVectorRep || option_config == kHashLinkList || + option_config == kHashCuckoo || option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions || + option_config == kFIFOCompaction || + option_config == kConcurrentSkipList) { + return true; } +#endif + if ((skip_mask & kSkipUniversalCompaction) && - (option_config_ == kUniversalCompaction || - option_config_ == kUniversalCompactionMultiLevel)) { - continue; + (option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel)) { + return true; } - if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) { - continue; + if ((skip_mask & kSkipMergePut) && option_config == kMergePut) { + return true; } if ((skip_mask & kSkipNoSeekToLast) && - (option_config_ == kHashLinkList || - option_config_ == kHashSkipList)) {; - continue; + (option_config == kHashLinkList || option_config == kHashSkipList)) { + return true; } if ((skip_mask & kSkipPlainTable) && - (option_config_ == kPlainTableAllBytesPrefix || - option_config_ == kPlainTableFirstBytePrefix || - option_config_ == kPlainTableCappedPrefix || - option_config_ == kPlainTableCappedPrefixNonMmap)) { - continue; + (option_config == kPlainTableAllBytesPrefix || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || 
+ option_config == kPlainTableCappedPrefixNonMmap)) { + return true; } if ((skip_mask & kSkipHashIndex) && - (option_config_ == kBlockBasedTableWithPrefixHashIndex || - option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) { - continue; + (option_config == kBlockBasedTableWithPrefixHashIndex || + option_config == kBlockBasedTableWithWholeKeyHashIndex)) { + return true; } - if ((skip_mask & kSkipHashCuckoo) && (option_config_ == kHashCuckoo)) { - continue; + if ((skip_mask & kSkipHashCuckoo) && (option_config == kHashCuckoo)) { + return true; } - if ((skip_mask & kSkipFIFOCompaction) && - option_config_ == kFIFOCompaction) { - continue; + if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) { + return true; } - if ((skip_mask & kSkipMmapReads) && - option_config_ == kWalDirAndMmapReads) { + if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) { + return true; + } + return false; +} + +// Switch to a fresh database with the next option configuration to +// test. Return false if there are no more configurations to test. 
+bool DBTestBase::ChangeOptions(int skip_mask) { + for (option_config_++; option_config_ < kEnd; option_config_++) { + if (ShouldSkipOptions(option_config_, skip_mask)) { continue; } break; @@ -191,6 +208,13 @@ Options DBTestBase::CurrentOptions( const anon::OptionsOverride& options_override) { Options options; options.write_buffer_size = 4090 * 4096; + options.target_file_size_base = 2 * 1024 * 1024; + options.max_bytes_for_level_base = 10 * 1024 * 1024; + options.max_open_files = 5000; + options.base_background_compactions = -1; + options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; + options.compaction_pri = CompactionPri::kByCompensatedSize; + return CurrentOptions(options, options_override); } @@ -205,10 +229,10 @@ Options DBTestBase::CurrentOptions( BlockBasedTableOptions table_options; bool set_block_based_table_factory = true; switch (option_config_) { +#ifndef ROCKSDB_LITE case kHashSkipList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.memtable_factory.reset( - NewHashSkipListRepFactory(16)); + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); break; case kPlainTableFirstBytePrefix: options.table_factory.reset(new PlainTableFactory()); @@ -238,6 +262,19 @@ Options DBTestBase::CurrentOptions( options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; + case kVectorRep: + options.memtable_factory.reset(new VectorRepFactory(100)); + break; + case kHashLinkList: + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.memtable_factory.reset( + NewHashLinkListRepFactory(4, 0, 3, true, 4)); + break; + case kHashCuckoo: + options.memtable_factory.reset( + NewHashCuckooRepFactory(options.write_buffer_size)); + break; +#endif // ROCKSDB_LITE case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); break; @@ -269,23 +306,9 @@ Options DBTestBase::CurrentOptions( case kPerfOptions: options.soft_rate_limit = 2.0; 
options.delayed_write_rate = 8 * 1024 * 1024; + options.report_bg_io_stats = true; // TODO(3.13) -- test more options break; - case kDeletesFilterFirst: - options.filter_deletes = true; - break; - case kVectorRep: - options.memtable_factory.reset(new VectorRepFactory(100)); - break; - case kHashLinkList: - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.memtable_factory.reset( - NewHashLinkListRepFactory(4, 0, 3, true, 4)); - break; - case kHashCuckoo: - options.memtable_factory.reset( - NewHashCuckooRepFactory(options.write_buffer_size)); - break; case kUniversalCompaction: options.compaction_style = kCompactionStyleUniversal; options.num_levels = 1; @@ -296,7 +319,7 @@ Options DBTestBase::CurrentOptions( break; case kCompressedBlockCache: options.allow_mmap_writes = true; - table_options.block_cache_compressed = NewLRUCache(8*1024*1024); + table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); break; case kInfiniteMaxOpenFiles: options.max_open_files = -1; @@ -319,6 +342,10 @@ Options DBTestBase::CurrentOptions( options.prefix_extractor.reset(NewNoopTransform()); break; } + case kBlockBasedTableWithIndexRestartInterval: { + table_options.index_block_restart_interval = 8; + break; + } case kOptimizeFiltersForHits: { options.optimize_filters_for_hits = true; set_block_based_table_factory = true; @@ -328,6 +355,10 @@ Options DBTestBase::CurrentOptions( options.row_cache = NewLRUCache(1024 * 1024); break; } + case kRecycleLogFiles: { + options.recycle_log_file_num = 2; + break; + } case kLevelSubcompactions: { options.max_subcompactions = 4; break; @@ -338,6 +369,11 @@ Options DBTestBase::CurrentOptions( options.max_subcompactions = 4; break; } + case kConcurrentSkipList: { + options.allow_concurrent_memtable_write = true; + options.enable_write_thread_adaptive_yield = true; + break; + } default: break; @@ -351,11 +387,12 @@ Options DBTestBase::CurrentOptions( } options.env = env_; options.create_if_missing = true; + 
options.fail_if_options_file_error = true; return options; } void DBTestBase::CreateColumnFamilies(const std::vector& cfs, - const Options& options) { + const Options& options) { ColumnFamilyOptions cf_opts(options); size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); @@ -365,7 +402,7 @@ void DBTestBase::CreateColumnFamilies(const std::vector& cfs, } void DBTestBase::CreateAndReopenWithCF(const std::vector& cfs, - const Options& options) { + const Options& options) { CreateColumnFamilies(cfs, options); std::vector cfs_plus_default = cfs; cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); @@ -373,18 +410,17 @@ void DBTestBase::CreateAndReopenWithCF(const std::vector& cfs, } void DBTestBase::ReopenWithColumnFamilies(const std::vector& cfs, - const std::vector& options) { + const std::vector& options) { ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } void DBTestBase::ReopenWithColumnFamilies(const std::vector& cfs, - const Options& options) { + const Options& options) { ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } Status DBTestBase::TryReopenWithColumnFamilies( - const std::vector& cfs, - const std::vector& options) { + const std::vector& cfs, const std::vector& options) { Close(); EXPECT_EQ(cfs.size(), options.size()); std::vector column_families; @@ -396,8 +432,7 @@ Status DBTestBase::TryReopenWithColumnFamilies( } Status DBTestBase::TryReopenWithColumnFamilies( - const std::vector& cfs, - const Options& options) { + const std::vector& cfs, const Options& options) { Close(); std::vector v_opts(cfs.size(), options); return TryReopenWithColumnFamilies(cfs, v_opts); @@ -409,7 +444,7 @@ void DBTestBase::Reopen(const Options& options) { void DBTestBase::Close() { for (auto h : handles_) { - delete h; + db_->DestroyColumnFamilyHandle(h); } handles_.clear(); delete db_; @@ -454,7 +489,7 @@ Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) { } Status DBTestBase::Put(int cf, const Slice& k, const 
Slice& v, - WriteOptions wo) { + WriteOptions wo) { if (kMergePut == option_config_) { return db_->Merge(wo, handles_[cf], k, v); } else { @@ -493,7 +528,7 @@ std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) { } std::string DBTestBase::Get(int cf, const std::string& k, - const Snapshot* snapshot) { + const Snapshot* snapshot) { ReadOptions options; options.verify_checksums = true; options.snapshot = snapshot; @@ -552,9 +587,9 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { Arena arena; ScopedArenaIterator iter; if (cf == 0) { - iter.set(dbfull()->TEST_NewInternalIterator(&arena)); + iter.set(dbfull()->NewInternalIterator(&arena)); } else { - iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); + iter.set(dbfull()->NewInternalIterator(&arena, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); @@ -605,6 +640,7 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { return result; } +#ifndef ROCKSDB_LITE int DBTestBase::NumSortedRuns(int cf) { ColumnFamilyMetaData cf_meta; if (cf == 0) { @@ -631,20 +667,6 @@ uint64_t DBTestBase::TotalSize(int cf) { return cf_meta.size; } -int DBTestBase::NumTableFilesAtLevel(int level, int cf) { - std::string property; - if (cf == 0) { - // default cfd - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); - } else { - EXPECT_TRUE(db_->GetProperty( - handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), - &property)); - } - return atoi(property.c_str()); -} - uint64_t DBTestBase::SizeAtLevel(int level) { std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -657,20 +679,57 @@ uint64_t DBTestBase::SizeAtLevel(int level) { return sum; } -int DBTestBase::TotalLiveFiles(int cf) { +size_t DBTestBase::TotalLiveFiles(int cf) { ColumnFamilyMetaData cf_meta; if (cf == 0) { db_->GetColumnFamilyMetaData(&cf_meta); } else { 
db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta); } - int num_files = 0; + size_t num_files = 0; for (auto& level : cf_meta.levels) { num_files += level.files.size(); } return num_files; } +size_t DBTestBase::CountLiveFiles() { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + return metadata.size(); +} +#endif // ROCKSDB_LITE + +int DBTestBase::NumTableFilesAtLevel(int level, int cf) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(level), &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), + &property)); + } + return atoi(property.c_str()); +} + +double DBTestBase::CompressionRatioAtLevel(int level, int cf) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.compression-ratio-at-level" + NumberToString(level), + &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], + "rocksdb.compression-ratio-at-level" + NumberToString(level), + &property)); + } + return std::stod(property); +} + int DBTestBase::TotalTableFiles(int cf, int levels) { if (levels == -1) { levels = CurrentOptions().num_levels; @@ -713,12 +772,6 @@ size_t DBTestBase::CountFiles() { return files.size() + logfiles.size(); } -size_t DBTestBase::CountLiveFiles() { - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - return metadata.size(); -} - uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) { Range r(start, limit); uint64_t size; @@ -731,7 +784,7 @@ uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) { } void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit, - uint32_t target_path_id) { + uint32_t target_path_id) { CompactRangeOptions compact_options; compact_options.target_path_id = target_path_id; ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit)); 
@@ -748,9 +801,8 @@ void DBTestBase::Compact(const Slice& start, const Slice& limit) { // Do n memtable compactions, each of which produces an sstable // covering the range [small,large]. -void DBTestBase::MakeTables( - int n, const std::string& small, - const std::string& large, int cf) { +void DBTestBase::MakeTables(int n, const std::string& small, + const std::string& large, int cf) { for (int i = 0; i < n; i++) { ASSERT_OK(Put(cf, small, "begin")); ASSERT_OK(Put(cf, large, "end")); @@ -761,8 +813,8 @@ void DBTestBase::MakeTables( // Prevent pushing of new sstables into deeper levels by adding // tables that cover a specified range to all levels. -void DBTestBase::FillLevels( - const std::string& smallest, const std::string& largest, int cf) { +void DBTestBase::FillLevels(const std::string& smallest, + const std::string& largest, int cf) { MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf); } @@ -779,7 +831,7 @@ void DBTestBase::MoveFilesToLevel(int level, int cf) { void DBTestBase::DumpFileCounts(const char* label) { fprintf(stderr, "---\n%s:\n", label); fprintf(stderr, "maxoverlap: %" PRIu64 "\n", - dbfull()->TEST_MaxNextLevelOverlappingBytes()); + dbfull()->TEST_MaxNextLevelOverlappingBytes()); for (int level = 0; level < db_->NumberLevels(); level++) { int num = NumTableFilesAtLevel(level); if (num > 0) { @@ -815,7 +867,7 @@ int DBTestBase::GetSstFileCount(std::string path) { // this will generate non-overlapping files since it keeps increasing key_idx void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx, bool nowait) { - for (int i = 0; i < 100; i++) { + for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { ASSERT_OK(Put(cf, Key(*key_idx), RandomString(rnd, (i == 99) ? 
1 : 990))); (*key_idx)++; } @@ -827,7 +879,7 @@ void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx, // this will generate non-overlapping files since it keeps increasing key_idx void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) { - for (int i = 0; i < 100; i++) { + for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990))); (*key_idx)++; } @@ -837,8 +889,10 @@ void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) { } } +const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51; + void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) { - for (int i = 0; i < 51; i++) { + for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) { ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 2000))); } ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 200))); @@ -888,9 +942,10 @@ void DBTestBase::VerifyIterLast(std::string expected_key, int cf) { // sets newValue with delta // If previous value is not empty, // updates previous value with 'b' string of previous value size - 1. 
-UpdateStatus DBTestBase::updateInPlaceSmallerSize( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue) { +UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue) { if (prevValue == nullptr) { *newValue = std::string(delta.size(), 'c'); return UpdateStatus::UPDATED; @@ -902,9 +957,10 @@ UpdateStatus DBTestBase::updateInPlaceSmallerSize( } } -UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue) { +UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue) { if (prevValue == nullptr) { *newValue = std::string(delta.size(), 'c'); return UpdateStatus::UPDATED; @@ -916,16 +972,17 @@ UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize( } } -UpdateStatus DBTestBase::updateInPlaceLargerSize( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue) { +UpdateStatus DBTestBase::updateInPlaceLargerSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue) { *newValue = std::string(delta.size(), 'c'); return UpdateStatus::UPDATED; } -UpdateStatus DBTestBase::updateInPlaceNoAction( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue) { +UpdateStatus DBTestBase::updateInPlaceNoAction(char* prevValue, + uint32_t* prevSize, Slice delta, + std::string* newValue) { return UpdateStatus::UPDATE_FAILED; } @@ -934,9 +991,9 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { ScopedArenaIterator iter; Arena arena; if (cf != 0) { - iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); + iter.set(dbfull()->NewInternalIterator(&arena, handles_[cf])); } else { - iter.set(dbfull()->TEST_NewInternalIterator(&arena)); + iter.set(dbfull()->NewInternalIterator(&arena)); } iter->SeekToFirst(); ASSERT_EQ(iter->status().ok(), true); @@ -953,9 
+1010,8 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { ASSERT_EQ(0, seq); } -void DBTestBase::CopyFile( - const std::string& source, const std::string& destination, - uint64_t size) { +void DBTestBase::CopyFile(const std::string& source, + const std::string& destination, uint64_t size) { const EnvOptions soptions; unique_ptr srcfile; ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions)); @@ -978,4 +1034,122 @@ void DBTestBase::CopyFile( ASSERT_OK(destfile->Close()); } +std::unordered_map DBTestBase::GetAllSSTFiles( + uint64_t* total_size) { + std::unordered_map res; + + if (total_size) { + *total_size = 0; + } + std::vector files; + env_->GetChildren(dbname_, &files); + for (auto& file_name : files) { + uint64_t number; + FileType type; + std::string file_path = dbname_ + "/" + file_name; + if (ParseFileName(file_name, &number, &type) && type == kTableFile) { + uint64_t file_size = 0; + env_->GetFileSize(file_path, &file_size); + res[file_path] = file_size; + if (total_size) { + *total_size += file_size; + } + } + } + return res; +} + +std::vector DBTestBase::ListTableFiles(Env* env, + const std::string& path) { + std::vector files; + std::vector file_numbers; + env->GetChildren(path, &files); + uint64_t number; + FileType type; + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == kTableFile) { + file_numbers.push_back(number); + } + } + } + return file_numbers; +} + +void DBTestBase::VerifyDBFromMap(std::map true_data) { + for (auto& kv : true_data) { + ASSERT_EQ(Get(kv.first), kv.second); + } + + ReadOptions ro; + ro.total_order_seek = true; + Iterator* iter = db_->NewIterator(ro); + // Verify Iterator::Next() + auto data_iter = true_data.begin(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) { + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + } + ASSERT_EQ(data_iter, true_data.end()); + + 
// Verify Iterator::Prev() + auto data_rev = true_data.rbegin(); + for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) { + ASSERT_EQ(iter->key().ToString(), data_rev->first); + ASSERT_EQ(iter->value().ToString(), data_rev->second); + } + ASSERT_EQ(data_rev, true_data.rend()); + + // Verify Iterator::Seek() + for (auto kv : true_data) { + iter->Seek(kv.first); + ASSERT_EQ(kv.first, iter->key().ToString()); + ASSERT_EQ(kv.second, iter->value().ToString()); + } + + delete iter; +} + +#ifndef ROCKSDB_LITE + +Status DBTestBase::GenerateAndAddExternalFile(const Options options, + std::vector keys, + size_t file_id) { + std::string file_path = + test::TmpDir(env_) + "/sst_files/" + ToString(file_id); + SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator); + + Status s = sst_file_writer.Open(file_path); + if (!s.ok()) { + return s; + } + for (auto& entry : keys) { + std::string k = Key(entry); + std::string v = k + ToString(file_id); + s = sst_file_writer.Add(k, v); + if (!s.ok()) { + return s; + } + } + s = sst_file_writer.Finish(); + + if (s.ok()) { + s = db_->AddFile(std::vector(1, file_path)); + } + + return s; +} + +uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily( + DB* db, std::string column_family_name) { + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + uint64_t result = 0; + for (auto& fileMetadata : metadata) { + result += (fileMetadata.column_family_name == column_family_name); + } + return result; +} +#endif // ROCKSDB_LITE + } // namespace rocksdb diff --git a/external/rocksdb/util/db_test_util.h b/external/rocksdb/db/db_test_util.h similarity index 70% rename from external/rocksdb/util/db_test_util.h rename to external/rocksdb/db/db_test_util.h index 774cce8748..237dc51e43 100644 --- a/external/rocksdb/util/db_test_util.h +++ b/external/rocksdb/db/db_test_util.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,6 +19,7 @@ #endif #include +#include #include #include #include @@ -29,6 +30,7 @@ #include "db/db_impl.h" #include "db/dbformat.h" #include "db/filename.h" +#include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" @@ -37,17 +39,18 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/sst_file_writer.h" +#include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" #include "table/block_based_table_factory.h" #include "table/mock_table.h" #include "table/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" #include "util/compression.h" -#include "util/db_test_util.h" -#include "util/hash_linklist_rep.h" #include "util/mock_env.h" #include "util/mutexlock.h" -#include "util/scoped_arena_iterator.h" + #include "util/string_util.h" // SyncPoint is not supported in Released Windows Mode. #if !(defined NDEBUG) || !defined(OS_WIN) @@ -117,6 +120,84 @@ struct OptionsOverride { } // namespace anon +// A hacky skip list mem table that triggers flush after number of entries. +class SpecialMemTableRep : public MemTableRep { + public: + explicit SpecialMemTableRep(MemTableAllocator* allocator, + MemTableRep* memtable, int num_entries_flush) + : MemTableRep(allocator), + memtable_(memtable), + num_entries_flush_(num_entries_flush), + num_entries_(0) {} + + virtual KeyHandle Allocate(const size_t len, char** buf) override { + return memtable_->Allocate(len, buf); + } + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. 
+ virtual void Insert(KeyHandle handle) override { + memtable_->Insert(handle); + num_entries_++; + } + + // Returns true iff an entry that compares equal to key is in the list. + virtual bool Contains(const char* key) const override { + return memtable_->Contains(key); + } + + virtual size_t ApproximateMemoryUsage() override { + // Return a high memory usage when number of entries exceeds the threshold + // to trigger a flush. + return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024; + } + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override { + memtable_->Get(k, callback_args, callback_func); + } + + uint64_t ApproximateNumEntries(const Slice& start_ikey, + const Slice& end_ikey) override { + return memtable_->ApproximateNumEntries(start_ikey, end_ikey); + } + + virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + return memtable_->GetIterator(arena); + } + + virtual ~SpecialMemTableRep() override {} + + private: + unique_ptr memtable_; + int num_entries_flush_; + int num_entries_; +}; + +// The factory for the hacky skip list mem table that triggers flush after +// number of entries exceeds a threshold. +class SpecialSkipListFactory : public MemTableRepFactory { + public: + // After number of inserts exceeds `num_entries_flush` in a mem table, trigger + // flush. 
+ explicit SpecialSkipListFactory(int num_entries_flush) + : num_entries_flush_(num_entries_flush) {} + + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, + const SliceTransform* transform, Logger* logger) override { + return new SpecialMemTableRep( + allocator, factory_.CreateMemTableRep(compare, allocator, transform, 0), + num_entries_flush_); + } + virtual const char* Name() const override { return "SkipListFactory"; } + + private: + SkipListFactory factory_; + int num_entries_flush_; +}; + // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: @@ -131,9 +212,7 @@ class SpecialEnv : public EnvWrapper { public: SSTableFile(SpecialEnv* env, unique_ptr&& base) - : env_(env), - base_(std::move(base)) { - } + : env_(env), base_(std::move(base)) {} Status Append(const Slice& data) override { if (env_->table_write_callback_) { (*env_->table_write_callback_)(); @@ -148,9 +227,7 @@ class SpecialEnv : public EnvWrapper { return base_->Append(data); } } - Status Truncate(uint64_t size) override { - return base_->Truncate(size); - } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status Close() override { // SyncPoint is not supported in Released Windows Mode. 
#if !(defined NDEBUG) || !defined(OS_WIN) @@ -180,7 +257,7 @@ class SpecialEnv : public EnvWrapper { class ManifestFile : public WritableFile { public: ManifestFile(SpecialEnv* env, unique_ptr&& b) - : env_(env), base_(std::move(b)) { } + : env_(env), base_(std::move(b)) {} Status Append(const Slice& data) override { if (env_->manifest_write_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated writer error"); @@ -208,7 +285,10 @@ class SpecialEnv : public EnvWrapper { class WalFile : public WritableFile { public: WalFile(SpecialEnv* env, unique_ptr&& b) - : env_(env), base_(std::move(b)) {} + : env_(env), base_(std::move(b)) { + env_->num_open_wal_file_.fetch_add(1); + } + virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); } Status Append(const Slice& data) override { #if !(defined NDEBUG) || !defined(OS_WIN) TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1"); @@ -282,24 +362,30 @@ class SpecialEnv : public EnvWrapper { class CountingFile : public RandomAccessFile { public: CountingFile(unique_ptr&& target, - anon::AtomicCounter* counter) - : target_(std::move(target)), counter_(counter) { - } + anon::AtomicCounter* counter, + std::atomic* bytes_read) + : target_(std::move(target)), + counter_(counter), + bytes_read_(bytes_read) {} virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { counter_->Increment(); - return target_->Read(offset, n, result, scratch); + Status s = target_->Read(offset, n, result, scratch); + *bytes_read_ += result->size(); + return s; } private: unique_ptr target_; anon::AtomicCounter* counter_; + std::atomic* bytes_read_; }; Status s = target()->NewRandomAccessFile(f, r, soptions); random_file_open_counter_++; if (s.ok() && count_random_reads_) { - r->reset(new CountingFile(std::move(*r), &random_read_counter_)); + r->reset(new CountingFile(std::move(*r), &random_read_counter_, + &random_read_bytes_counter_)); } return s; } @@ -329,18 +415,21 @@ class SpecialEnv : public 
EnvWrapper { return s; } - virtual void SleepForMicroseconds(int micros) override { sleep_counter_.Increment(); - if (no_sleep_) { + if (no_sleep_ || time_elapse_only_sleep_) { addon_time_.fetch_add(micros); - } else { + } + if (!no_sleep_) { target()->SleepForMicroseconds(micros); } } virtual Status GetCurrentTime(int64_t* unix_time) override { - Status s = target()->GetCurrentTime(unix_time); + Status s; + if (!time_elapse_only_sleep_) { + s = target()->GetCurrentTime(unix_time); + } if (s.ok()) { *unix_time += addon_time_.load(); } @@ -348,11 +437,18 @@ class SpecialEnv : public EnvWrapper { } virtual uint64_t NowNanos() override { - return target()->NowNanos() + addon_time_.load() * 1000; + return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) + + addon_time_.load() * 1000; } virtual uint64_t NowMicros() override { - return target()->NowMicros() + addon_time_.load(); + return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) + + addon_time_.load(); + } + + virtual Status DeleteFile(const std::string& fname) override { + delete_count_.fetch_add(1); + return target()->DeleteFile(fname); } Random rnd_; @@ -382,8 +478,12 @@ class SpecialEnv : public EnvWrapper { // Slow down every log write, in micro-seconds. std::atomic log_write_slowdown_; + // Number of WAL files that are still open for write. 
+ std::atomic num_open_wal_file_; + bool count_random_reads_; anon::AtomicCounter random_read_counter_; + std::atomic random_read_bytes_counter_; std::atomic random_file_open_counter_; bool count_sequential_reads_; @@ -404,11 +504,43 @@ class SpecialEnv : public EnvWrapper { std::function* table_write_callback_; std::atomic addon_time_; + + std::atomic delete_count_; + + bool time_elapse_only_sleep_; + bool no_sleep_; - std::atomic is_wal_sync_thread_safe_ {true}; + std::atomic is_wal_sync_thread_safe_{true}; }; +#ifndef ROCKSDB_LITE +class OnFileDeletionListener : public EventListener { + public: + OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {} + + void SetExpectedFileName(const std::string file_name) { + expected_file_name_ = file_name; + } + + void VerifyMatchedCount(size_t expected_value) { + ASSERT_EQ(matched_count_, expected_value); + } + + void OnTableFileDeleted(const TableFileDeletionInfo& info) override { + if (expected_file_name_ != "") { + ASSERT_EQ(expected_file_name_, info.file_path); + expected_file_name_ = ""; + matched_count_++; + } + } + + private: + size_t matched_count_; + std::string expected_file_name_; +}; +#endif + class DBTestBase : public testing::Test { protected: // Sequence of option configurations to try @@ -432,19 +564,21 @@ class DBTestBase : public testing::Test { kWalDirAndMmapReads = 16, kManifestFileSize = 17, kPerfOptions = 18, - kDeletesFilterFirst = 19, - kHashSkipList = 20, - kUniversalCompaction = 21, - kUniversalCompactionMultiLevel = 22, - kCompressedBlockCache = 23, - kInfiniteMaxOpenFiles = 24, - kxxHashChecksum = 25, - kFIFOCompaction = 26, - kOptimizeFiltersForHits = 27, - kRowCache = 28, - kLevelSubcompactions = 29, - kUniversalSubcompactions = 30, - kEnd = 29 + kHashSkipList = 19, + kUniversalCompaction = 20, + kUniversalCompactionMultiLevel = 21, + kCompressedBlockCache = 22, + kInfiniteMaxOpenFiles = 23, + kxxHashChecksum = 24, + kFIFOCompaction = 25, + kOptimizeFiltersForHits = 26, + 
kRowCache = 27, + kRecycleLogFiles = 28, + kConcurrentSkipList = 29, + kEnd = 30, + kLevelSubcompactions = 31, + kUniversalSubcompactions = 32, + kBlockBasedTableWithIndexRestartInterval = 33, }; int option_config_; @@ -490,6 +624,8 @@ class DBTestBase : public testing::Test { return std::string(buf); } + static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip); + // Switch to a fresh database with the next option configuration to // test. Return false if there are no more configurations to test. bool ChangeOptions(int skip_mask = kNoSkip); @@ -509,9 +645,7 @@ class DBTestBase : public testing::Test { const Options& defaultOptions, const anon::OptionsOverride& options_override = anon::OptionsOverride()); - DBImpl* dbfull() { - return reinterpret_cast(db_); - } + DBImpl* dbfull() { return reinterpret_cast(db_); } void CreateColumnFamilies(const std::vector& cfs, const Options& options); @@ -525,9 +659,8 @@ class DBTestBase : public testing::Test { void ReopenWithColumnFamilies(const std::vector& cfs, const Options& options); - Status TryReopenWithColumnFamilies( - const std::vector& cfs, - const std::vector& options); + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options); Status TryReopenWithColumnFamilies(const std::vector& cfs, const Options& options); @@ -574,15 +707,21 @@ class DBTestBase : public testing::Test { std::string AllEntriesFor(const Slice& user_key, int cf = 0); +#ifndef ROCKSDB_LITE int NumSortedRuns(int cf = 0); uint64_t TotalSize(int cf = 0); - int NumTableFilesAtLevel(int level, int cf = 0); - uint64_t SizeAtLevel(int level); - int TotalLiveFiles(int cf = 0); + size_t TotalLiveFiles(int cf = 0); + + size_t CountLiveFiles(); +#endif // ROCKSDB_LITE + + int NumTableFilesAtLevel(int level, int cf = 0); + + double CompressionRatioAtLevel(int level, int cf = 0); int TotalTableFiles(int cf = 0, int levels = -1); @@ -591,8 +730,6 @@ class DBTestBase : public testing::Test { size_t CountFiles(); - 
size_t CountLiveFiles(); - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0); void Compact(int cf, const Slice& start, const Slice& limit, @@ -627,6 +764,9 @@ class DBTestBase : public testing::Test { void GenerateNewFile(int fd, Random* rnd, int* key_idx, bool nowait = false); + static const int kNumKeysByGenerateNewRandomFile; + static const int KNumKeysByGenerateNewFile = 100; + void GenerateNewRandomFile(Random* rnd, bool nowait = false); std::string IterStatus(Iterator* iter); @@ -643,27 +783,46 @@ class DBTestBase : public testing::Test { // sets newValue with delta // If previous value is not empty, // updates previous value with 'b' string of previous value size - 1. - static UpdateStatus updateInPlaceSmallerSize( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue); + static UpdateStatus updateInPlaceSmallerSize(char* prevValue, + uint32_t* prevSize, Slice delta, + std::string* newValue); - static UpdateStatus updateInPlaceSmallerVarintSize( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue); + static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue); - static UpdateStatus updateInPlaceLargerSize( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue); + static UpdateStatus updateInPlaceLargerSize(char* prevValue, + uint32_t* prevSize, Slice delta, + std::string* newValue); - static UpdateStatus updateInPlaceNoAction( - char* prevValue, uint32_t* prevSize, - Slice delta, std::string* newValue); + static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue); // Utility method to test InplaceUpdate void validateNumberOfEntries(int numValues, int cf = 0); void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0); + + std::unordered_map GetAllSSTFiles( + uint64_t* total_size = nullptr); + + std::vector 
ListTableFiles(Env* env, const std::string& path); + + void VerifyDBFromMap(std::map true_data); + +#ifndef ROCKSDB_LITE + Status GenerateAndAddExternalFile(const Options options, + std::vector keys, size_t file_id); + + uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string column_family_name); +#endif // ROCKSDB_LITE + + uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); + } }; } // namespace rocksdb diff --git a/external/rocksdb/db/db_universal_compaction_test.cc b/external/rocksdb/db/db_universal_compaction_test.cc index 8e18699c25..7eab802d41 100644 --- a/external/rocksdb/db/db_universal_compaction_test.cc +++ b/external/rocksdb/db/db_universal_compaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,9 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/db_test_util.h" -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) #include "util/sync_point.h" namespace rocksdb { @@ -22,12 +22,16 @@ static std::string CompressibleString(Random* rnd, int len) { class DBTestUniversalCompactionBase : public DBTestBase, - public ::testing::WithParamInterface { + public ::testing::WithParamInterface> { public: explicit DBTestUniversalCompactionBase( const std::string& path) : DBTestBase(path) {} - virtual void SetUp() override { num_levels_ = GetParam(); } + virtual void SetUp() override { + num_levels_ = std::get<0>(GetParam()); + exclusive_manual_compaction_ = std::get<1>(GetParam()); + } int num_levels_; + bool exclusive_manual_compaction_; }; class DBTestUniversalCompaction : public DBTestUniversalCompactionBase { @@ -112,6 +116,111 @@ class DelayFilterFactory : public CompactionFilterFactory { }; } // namespace +// Make sure we don't trigger a problem if the trigger conditon is given +// to be 0, which is invalid. +TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) { + Options options = CurrentOptions(); + + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + // Config universal compaction to always compact to one single sorted run. 
+ options.level0_file_num_compaction_trigger = 0; + options.compaction_options_universal.size_ratio = 10; + options.compaction_options_universal.min_merge_width = 2; + options.compaction_options_universal.max_size_amplification_percent = 0; + + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; + options.target_file_size_base = 32 << 10; // 32KB + // trigger compaction if there are >= 4 files + KeepFilterFactory* filter = new KeepFilterFactory(true); + filter->expect_manual_compaction_.store(false); + options.compaction_filter_factory.reset(filter); + + DestroyAndReopen(options); + ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger); + + Random rnd(301); + int key_idx = 0; + + filter->expect_full_compaction_.store(true); + + for (int num = 0; num < 16; num++) { + // Write 100KB file. And immediately it should be compacted to one file. + GenerateNewFile(&rnd, &key_idx); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumSortedRuns(0), 1); + } + ASSERT_OK(Put(Key(key_idx), "")); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumSortedRuns(0), 1); +} + +TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 5; + options.num_levels = num_levels_; + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; + options.target_file_size_base = 32 << 10; // 32KB + // trigger compaction if there are >= 4 files + options.level0_file_num_compaction_trigger = 4; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.optimize_filters_for_hits = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.memtable_factory.reset(new SpecialSkipListFactory(3)); + 
+ DestroyAndReopen(options); + + // block compaction from happening + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + Put(Key(num * 10), "val"); + if (num) { + dbfull()->TEST_WaitForFlushMemTable(); + } + Put(Key(30 + num * 10), "val"); + Put(Key(60 + num * 10), "val"); + } + Put("", ""); + dbfull()->TEST_WaitForFlushMemTable(); + + // Query set of non existing keys + for (int i = 5; i < 90; i += 10) { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + + // Make sure bloom filter is used at least once. + ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL); + + // Make sure bloom filter is used for all but the last L0 file when looking + // up a non-existent key that's in the range of all L0 files. + ASSERT_EQ(Get(Key(35)), "NOT_FOUND"); + ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1, + TestGetTickerCount(options, BLOOM_FILTER_USEFUL)); + prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL); + + // Unblock compaction and wait it for happening. + sleeping_task_low.WakeUp(); + dbfull()->TEST_WaitForCompact(); + + // The same queries will not trigger bloom filter + for (int i = 5; i < 90; i += 10) { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL)); +} + // TODO(kailiu) The tests on UniversalCompaction has some issues: // 1. A lot of magic numbers ("11" or "12"). // 2. 
Made assumption on the memtable flush conditions, which may change from @@ -119,6 +228,7 @@ class DelayFilterFactory : public CompactionFilterFactory { TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) { Options options; options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 5; options.num_levels = num_levels_; options.write_buffer_size = 105 << 10; // 105KB options.arena_block_size = 4 << 10; @@ -223,13 +333,12 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) { } TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.num_levels = num_levels_; options.write_buffer_size = 100 << 10; // 100KB options.target_file_size_base = 32 << 10; // 32KB options.level0_file_num_compaction_trigger = 3; - options = CurrentOptions(options); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -274,12 +383,12 @@ TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) { ChangeCompactOptions(); Options options; options.create_if_missing = true; - options.write_buffer_size = kEntrySize * kEntriesPerBuffer; options.compaction_style = kCompactionStyleLevel; options.num_levels = 1; options.target_file_size_base = options.write_buffer_size; options.compression = kNoCompression; options = CurrentOptions(options); + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); Random rnd(301); @@ -337,12 +446,11 @@ TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) { } TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 100 << 10; // 100KB options.num_levels = 7; 
options.disable_auto_compactions = true; - options = CurrentOptions(options); DestroyAndReopen(options); // Generate 3 overlapping files @@ -367,6 +475,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 4; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; db_->CompactRange(compact_options, nullptr, nullptr); ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); } @@ -381,14 +490,13 @@ class DBTestUniversalCompactionMultiLevels }; TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.num_levels = num_levels_; options.write_buffer_size = 100 << 10; // 100KB options.level0_file_num_compaction_trigger = 8; options.max_background_compactions = 3; options.target_file_size_base = 32 * 1024; - options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Trigger compaction if size amplification exceeds 110% @@ -424,7 +532,7 @@ TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) { }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.compaction_options_universal.allow_trivial_move = true; options.num_levels = 3; @@ -432,7 +540,6 @@ TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) { options.level0_file_num_compaction_trigger = 3; options.max_background_compactions = 2; options.target_file_size_base = 32 * 1024; - options = CurrentOptions(options); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -459,7 +566,8 @@ TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) { INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionMultiLevels, 
DBTestUniversalCompactionMultiLevels, - ::testing::Values(3, 20)); + ::testing::Combine(::testing::Values(3, 20), + ::testing::Bool())); class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase { @@ -470,7 +578,7 @@ class DBTestUniversalCompactionParallel : }; TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.num_levels = num_levels_; options.write_buffer_size = 1 << 10; // 1KB @@ -479,7 +587,6 @@ TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) { options.max_background_flushes = 3; options.target_file_size_base = 1 * 1024; options.compaction_options_universal.max_size_amplification_percent = 110; - options = CurrentOptions(options); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -532,10 +639,11 @@ TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) { INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionParallel, DBTestUniversalCompactionParallel, - ::testing::Values(1, 10)); + ::testing::Combine(::testing::Values(1, 10), + ::testing::Bool())); TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 105 << 10; // 105KB options.arena_block_size = 4 << 10; // 4KB @@ -543,7 +651,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { options.level0_file_num_compaction_trigger = 4; options.num_levels = num_levels_; options.compaction_options_universal.compression_size_percent = -1; - options = CurrentOptions(options); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -656,14 +763,13 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) { return; } - Options options; + Options options = CurrentOptions(); options.compaction_style = 
kCompactionStyleUniversal; options.write_buffer_size = 100 << 10; // 100KB options.target_file_size_base = 32 << 10; // 32KB options.level0_file_num_compaction_trigger = 2; options.num_levels = num_levels_; options.compaction_options_universal.compression_size_percent = 70; - options = CurrentOptions(options); DestroyAndReopen(options); Random rnd(301); @@ -724,14 +830,13 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) { if (!Snappy_Supported()) { return; } - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 100 << 10; // 100KB options.target_file_size_base = 32 << 10; // 32KB options.level0_file_num_compaction_trigger = 2; options.num_levels = num_levels_; options.compaction_options_universal.compression_size_percent = 95; - options = CurrentOptions(options); DestroyAndReopen(options); Random rnd(301); @@ -767,7 +872,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) { }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.compaction_options_universal.allow_trivial_move = true; options.num_levels = 2; @@ -775,7 +880,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) { options.level0_file_num_compaction_trigger = 3; options.max_background_compactions = 1; options.target_file_size_base = 32 * 1024; - options = CurrentOptions(options); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -812,7 +916,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) { rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.compaction_options_universal.allow_trivial_move = true; options.num_levels = 15; @@ -820,7 +924,6 @@ 
TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) { options.level0_file_num_compaction_trigger = 8; options.max_background_compactions = 4; options.target_file_size_base = 64 * 1024; - options = CurrentOptions(options); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -846,17 +949,19 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) { } TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) { - Options options; + Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_, 300 * 1024); options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleUniversal; - options.write_buffer_size = 110 << 10; // 105KB + options.compaction_options_universal.size_ratio = 5; + options.write_buffer_size = 111 << 10; // 114KB options.arena_block_size = 4 << 10; options.level0_file_num_compaction_trigger = 2; options.num_levels = 1; - options = CurrentOptions(options); std::vector filenames; env_->GetChildren(options.db_paths[1].path, &filenames); @@ -968,29 +1073,27 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { int max_key1 = 200; int max_key2 = 600; int max_key3 = 800; + const int KNumKeysPerFile = 10; // Stage 1: open a DB with universal compaction, num_levels=1 Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.num_levels = 1; - options.write_buffer_size = 100 << 10; // 100KB + options.write_buffer_size = 200 << 10; // 200KB options.level0_file_num_compaction_trigger = 3; + options.memtable_factory.reset(new SpecialSkipListFactory(KNumKeysPerFile)); options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i <= 
max_key1; i++) { // each value is 10K ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForCompact(); } ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); - int non_level0_num_files = 0; - for (int i = 1; i < options.num_levels; i++) { - non_level0_num_files += NumTableFilesAtLevel(i, 1); - } - ASSERT_EQ(non_level0_num_files, 0); - // Stage 2: reopen with universal compaction, num_levels=4 options.compaction_style = kCompactionStyleUniversal; options.num_levels = 4; @@ -1003,6 +1106,8 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { for (int i = max_key1 + 1; i <= max_key2; i++) { // each value is 10K ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForCompact(); } ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); @@ -1019,6 +1124,7 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 0; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); // Need to restart it once to remove higher level records in manifest. 
ReopenWithColumnFamilies({"default", "pikachu"}, options); @@ -1032,6 +1138,8 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { for (int i = max_key2 + 1; i <= max_key3; i++) { // each value is 10K ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForCompact(); } ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); @@ -1043,16 +1151,17 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) { if (!Snappy_Supported()) { return; } - Options options; + Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_, 500 * 1024); options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024); options.compaction_style = kCompactionStyleUniversal; - options.write_buffer_size = 110 << 10; // 105KB - options.arena_block_size = 4 * 1024; + options.compaction_options_universal.size_ratio = 5; + options.write_buffer_size = 111 << 10; // 114KB options.arena_block_size = 4 << 10; options.level0_file_num_compaction_trigger = 2; options.num_levels = 1; - options = CurrentOptions(options); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); std::vector filenames; env_->GetChildren(options.db_paths[1].path, &filenames); @@ -1139,7 +1248,8 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) { } INSTANTIATE_TEST_CASE_P(UniversalCompactionNumLevels, DBTestUniversalCompaction, - ::testing::Values(1, 3, 5)); + ::testing::Combine(::testing::Values(1, 3, 5), + ::testing::Bool())); class DBTestUniversalManualCompactionOutputPathId : public DBTestUniversalCompactionBase { @@ -1171,6 +1281,7 @@ TEST_P(DBTestUniversalManualCompactionOutputPathId, // Full compaction to DB path 0 CompactRangeOptions compact_options; compact_options.target_path_id = 1; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); 
ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); @@ -1193,6 +1304,7 @@ TEST_P(DBTestUniversalManualCompactionOutputPathId, // Full compaction to DB path 0 compact_options.target_path_id = 0; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); @@ -1200,20 +1312,22 @@ TEST_P(DBTestUniversalManualCompactionOutputPathId, // Fail when compacting to an invalid path ID compact_options.target_path_id = 2; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) .IsInvalidArgument()); } INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId, DBTestUniversalManualCompactionOutputPathId, - ::testing::Values(1, 8)); + ::testing::Combine(::testing::Values(1, 8), + ::testing::Bool())); } // namespace rocksdb -#endif // !(defined NDEBUG) || !defined(OS_WIN) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/external/rocksdb/db/db_wal_test.cc b/external/rocksdb/db/db_wal_test.cc index 531021ec4c..145bd25fb9 100644 --- a/external/rocksdb/db/db_wal_test.cc +++ b/external/rocksdb/db/db_wal_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -7,11 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/db_test_util.h" -#if !(defined NDEBUG) || !defined(OS_WIN) +#include "util/options_helper.h" #include "util/sync_point.h" -#endif + +#include namespace rocksdb { class DBWALTest : public DBTestBase { @@ -70,7 +71,6 @@ TEST_F(DBWALTest, RollLog) { } while (ChangeOptions()); } -#if !(defined NDEBUG) || !defined(OS_WIN) TEST_F(DBWALTest, SyncWALNotBlockWrite) { Options options = CurrentOptions(); options.max_write_buffer_number = 4; @@ -130,15 +130,887 @@ TEST_F(DBWALTest, SyncWALNotWaitWrite) { ASSERT_EQ(Get("foo2"), "bar2"); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } + +TEST_F(DBWALTest, Recover) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + } while (ChangeOptions()); +} + +TEST_F(DBWALTest, RecoverWithTableHandle) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.avoid_flush_during_recovery = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Put(1, "bar", "v4")); + ASSERT_OK(Flush(1)); + 
ASSERT_OK(Put(1, "big", std::string(100, 'a'))); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + + std::vector> files; + dbfull()->TEST_GetFilesMetaData(handles_[1], &files); + size_t total_files = 0; + for (const auto& level : files) { + total_files += level.size(); + } + ASSERT_EQ(total_files, 3); + for (const auto& level : files) { + for (const auto& file : level) { + if (kInfiniteMaxOpenFiles == option_config_) { + ASSERT_TRUE(file.table_reader_handle != nullptr); + } else { + ASSERT_TRUE(file.table_reader_handle == nullptr); + } + } + } + } while (ChangeOptions()); +} + +TEST_F(DBWALTest, IgnoreRecoveredLog) { + std::string backup_logs = dbname_ + "/backup_logs"; + + // delete old files in backup_logs directory + env_->CreateDirIfMissing(backup_logs); + std::vector old_files; + env_->GetChildren(backup_logs, &old_files); + for (auto& file : old_files) { + if (file != "." && file != "..") { + env_->DeleteFile(backup_logs + "/" + file); + } + } + + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.wal_dir = dbname_ + "/logs"; + DestroyAndReopen(options); + + // fill up the DB + std::string one, two; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one))); + + // copy the logs to backup + std::vector logs; + env_->GetChildren(options.wal_dir, &logs); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); + } + } + + // recover the DB + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + + // copy the logs from backup back to wal dir + for (auto& log : logs) { + if (log != ".." 
&& log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // this should ignore the log files, recovery should not happen again + // if the recovery happens, the same merge operator would be called twice, + // leading to incorrect results + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + Destroy(options); + Reopen(options); + Close(); + + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // assert that we successfully recovered only from logs, even though we + // destroyed the DB + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + + // Recovery will fail if DB directory doesn't exist. + Destroy(options); + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." 
&& log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + env_->DeleteFile(backup_logs + "/" + log); + } + } + Status s = TryReopen(options); + ASSERT_TRUE(!s.ok()); + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST_F(DBWALTest, RecoveryWithEmptyLog) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "foo", "v2")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v3", Get(1, "foo")); + } while (ChangeOptions()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBWALTest, GetSortedWalFiles) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + VectorLogPtr log_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); + ASSERT_EQ(0, log_files.size()); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); + ASSERT_EQ(1, log_files.size()); + } while (ChangeOptions()); +} + +TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) { + // Test for regression of WAL cleanup missing files that don't contain data + // for every column family. + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "foo", "v2")); + std::array earliest_log_nums; + for (int i = 0; i < 2; ++i) { + if (i > 0) { + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + } + VectorLogPtr log_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); + if (log_files.size() > 0) { + earliest_log_nums[i] = log_files[0]->LogNumber(); + } else { + earliest_log_nums[i] = port::kMaxUint64; + } + } + // Check at least the first WAL was cleaned up during the recovery. 
+ ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]); + } while (ChangeOptions()); +} + +TEST_F(DBWALTest, RecoverWithLargeLog) { + do { + { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "big1", std::string(200000, '1'))); + ASSERT_OK(Put(1, "big2", std::string(200000, '2'))); + ASSERT_OK(Put(1, "small3", std::string(10, '3'))); + ASSERT_OK(Put(1, "small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. + Options options; + options.write_buffer_size = 100000; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); + ASSERT_EQ(std::string(200000, '1'), Get(1, "big1")); + ASSERT_EQ(std::string(200000, '2'), Get(1, "big2")); + ASSERT_EQ(std::string(10, '3'), Get(1, "small3")); + ASSERT_EQ(std::string(10, '4'), Get(1, "small4")); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 1); + } while (ChangeCompactOptions()); +} + +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it was empty. 
Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) { + Options options = CurrentOptions(); + options.write_buffer_size = 5000000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + // Since we will reopen DB with smaller write_buffer_size, + // each key will go to new SST file + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + + ASSERT_OK(Put(3, Key(10), DummyString(1))); + // Make 'dobrynia' to be flushed and new WAL file to be created + ASSERT_OK(Put(2, Key(10), DummyString(7500000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + // Make sure 'dobrynia' was flushed: check sst files amount + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + } + // New WAL file + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + + options.write_buffer_size = 4096; + options.arena_block_size = 4096; + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + // No inserts => default is empty + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(5)); + // 1 SST for big key + 1 SST for small one + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + // 1 SST for all keys + 
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } +} + +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it wasn't empty. Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST_F(DBWALTest, RecoverCheckFileAmount) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000; + options.arena_block_size = 4 * 1024; + options.avoid_flush_during_recovery = false; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Make 'nikitich' memtable to be flushed + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // 4 memtable are not flushed, 1 sst file + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + // Memtable for 'nikitich' has flushed, new WAL file has opened + // 4 memtable still not flushed + + // Write to new WAL file + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Fill up 'nikitich' one more time + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + // make it flush + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // There are still 4 memtable not flushed, and 2 sst tables + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(2)); + 
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + std::vector table_files = ListTableFiles(env_, dbname_); + // Check, that records for 'default', 'dobrynia' and 'pikachu' from + // first, second and third WALs went to the same SST. + // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for + // 'dobrynia', one for 'pikachu' + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + } +} + +TEST_F(DBWALTest, SyncMultipleLogs) { + const uint64_t kNumBatches = 2; + const int kBatchSize = 1000; + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.write_buffer_size = 4096; + Reopen(options); + + WriteBatch batch; + WriteOptions wo; + wo.sync = true; + + for (uint64_t b = 0; b < kNumBatches; b++) { + batch.Clear(); + for (int i = 0; i < kBatchSize; i++) { + batch.Put(Key(i), DummyString(128)); + } + + dbfull()->Write(wo, &batch); + } + + ASSERT_OK(dbfull()->SyncWAL()); +} + +// +// Test WAL recovery for the various modes available +// +class RecoveryTestHelper { + public: + // Number of WAL files to generate + static const int kWALFilesCount = 10; + // Starting number for the WAL file name like 00010.log + static const int kWALFileOffset = 10; + // Keys to be written per WAL file + static const int kKeysPerWALFile = 1024; + // Size of the value + static const int kValueSize = 10; + + // Create WAL files with values filled in + static void FillData(DBWALTest* test, const Options& options, + const size_t wal_count, size_t* count) { + const DBOptions& db_options = options; + + *count = 0; + + shared_ptr table_cache = 
NewLRUCache(50000, 16); + EnvOptions env_options; + WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); + + unique_ptr versions; + unique_ptr wal_manager; + WriteController write_controller; + + versions.reset(new VersionSet(test->dbname_, &db_options, env_options, + table_cache.get(), &write_buffer_manager, + &write_controller)); + + wal_manager.reset(new WalManager(db_options, env_options)); + + std::unique_ptr current_log_writer; + + for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { + uint64_t current_log_number = j; + std::string fname = LogFileName(test->dbname_, current_log_number); + unique_ptr file; + ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + current_log_writer.reset( + new log::Writer(std::move(file_writer), current_log_number, + db_options.recycle_log_file_num > 0)); + + for (int i = 0; i < kKeysPerWALFile; i++) { + std::string key = "key" + ToString((*count)++); + std::string value = test->DummyString(kValueSize); + assert(current_log_writer.get() != nullptr); + uint64_t seq = versions->LastSequence() + 1; + WriteBatch batch; + batch.Put(key, value); + WriteBatchInternal::SetSequence(&batch, seq); + current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); + versions->SetLastSequence(seq); + } + } + } + + // Recreate and fill the store with some data + static size_t FillData(DBWALTest* test, Options* options) { + options->create_if_missing = true; + test->DestroyAndReopen(*options); + test->Close(); + + size_t count = 0; + FillData(test, *options, kWALFilesCount, &count); + return count; + } + + // Read back all the keys we wrote and return the number of keys found + static size_t GetData(DBWALTest* test) { + size_t count = 0; + for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) { + if (test->Get("key" + ToString(i)) != "NOT_FOUND") { + ++count; + } + } + return count; + } + + 
// Manuall corrupt the specified WAL + static void CorruptWAL(DBWALTest* test, const Options& options, + const double off, const double len, + const int wal_file_id, const bool trunc = false) { + Env* env = options.env; + std::string fname = LogFileName(test->dbname_, wal_file_id); + uint64_t size; + ASSERT_OK(env->GetFileSize(fname, &size)); + ASSERT_GT(size, 0); +#ifdef OS_WIN + // Windows disk cache behaves differently. When we truncate + // the original content is still in the cache due to the original + // handle is still open. Generally, in Windows, one prohibits + // shared access to files and it is not needed for WAL but we allow + // it to induce corruption at various tests. + test->Close(); #endif + if (trunc) { + ASSERT_EQ(0, truncate(fname.c_str(), static_cast(size * off))); + } else { + InduceCorruption(fname, static_cast(size * off), + static_cast(size * len)); + } + } + + // Overwrite data with 'a' from offset for length len + static void InduceCorruption(const std::string& filename, size_t offset, + size_t len) { + ASSERT_GT(len, 0U); + + int fd = open(filename.c_str(), O_RDWR); + + // On windows long is 32-bit + ASSERT_LE(offset, std::numeric_limits::max()); + + ASSERT_GT(fd, 0); + ASSERT_EQ(offset, lseek(fd, static_cast(offset), SEEK_SET)); + + void* buf = alloca(len); + memset(buf, 'a', len); + ASSERT_EQ(len, write(fd, buf, static_cast(len))); + + close(fd); + } +}; + +// Test scope: +// - We expect to open the data store when there is incomplete trailing writes +// at the end of any of the logs +// - We do not expect to open the data store for corruption +TEST_F(DBWALTest, kTolerateCorruptedTailRecords) { + const int jstart = RecoveryTestHelper::kWALFileOffset; + const int jend = jstart + RecoveryTestHelper::kWALFilesCount; + + for (auto trunc : {true, false}) { /* Corruption style */ + for (int i = 0; i < 4; i++) { /* Corruption offset position */ + for (int j = jstart; j < jend; j++) { /* WAL file */ + // Fill data for testing + Options 
options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + // test checksum failure or parsing + RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, + /*len%=*/.1, /*wal=*/j, trunc); + + if (trunc) { + options.wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + const size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_TRUE(i == 0 || recovered_row_count > 0); + ASSERT_LT(recovered_row_count, row_count); + } else { + options.wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords; + ASSERT_NOK(TryReopen(options)); + } + } + } + } +} + +// Test scope: +// We don't expect the data store to be opened if there is any corruption +// (leading, middle or trailing -- incomplete writes or corruption) +TEST_F(DBWALTest, kAbsoluteConsistency) { + const int jstart = RecoveryTestHelper::kWALFileOffset; + const int jend = jstart + RecoveryTestHelper::kWALFilesCount; + + // Verify clean slate behavior + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count); + + for (auto trunc : {true, false}) { /* Corruption style */ + for (int i = 0; i < 4; i++) { /* Corruption offset position */ + if (trunc && i == 0) { + continue; + } + + for (int j = jstart; j < jend; j++) { /* wal files */ + // fill with new date + RecoveryTestHelper::FillData(this, &options); + // corrupt the wal + RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, + /*len%=*/.1, j, trunc); + // verify + options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; + options.create_if_missing = false; + ASSERT_NOK(TryReopen(options)); + } + } + } +} + +// Test scope: +// - We expect to 
open data store under all circumstances +// - We expect only data upto the point where the first error was encountered +TEST_F(DBWALTest, kPointInTimeRecovery) { + const int jstart = RecoveryTestHelper::kWALFileOffset; + const int jend = jstart + RecoveryTestHelper::kWALFilesCount; + const int maxkeys = + RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile; + + for (auto trunc : {true, false}) { /* Corruption style */ + for (int i = 0; i < 4; i++) { /* Offset of corruption */ + for (int j = jstart; j < jend; j++) { /* WAL file */ + // Fill data for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + + // Corrupt the wal + RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, + /*len%=*/.1, j, trunc); + + // Verify + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + + // Probe data for invariants + size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_LT(recovered_row_count, row_count); + + bool expect_data = true; + for (size_t k = 0; k < maxkeys; ++k) { + bool found = Get("key" + ToString(i)) != "NOT_FOUND"; + if (expect_data && !found) { + expect_data = false; + } + ASSERT_EQ(found, expect_data); + } + + const size_t min = RecoveryTestHelper::kKeysPerWALFile * + (j - RecoveryTestHelper::kWALFileOffset); + ASSERT_GE(recovered_row_count, min); + if (!trunc && i != 0) { + const size_t max = RecoveryTestHelper::kKeysPerWALFile * + (j - RecoveryTestHelper::kWALFileOffset + 1); + ASSERT_LE(recovered_row_count, max); + } + } + } + } +} + +// Test scope: +// - We expect to open the data store under all scenarios +// - We expect to have recovered records past the corruption zone +TEST_F(DBWALTest, kSkipAnyCorruptedRecords) { + const int jstart = RecoveryTestHelper::kWALFileOffset; + const int jend = jstart + RecoveryTestHelper::kWALFilesCount; + + for (auto trunc : 
{true, false}) { /* Corruption style */ + for (int i = 0; i < 4; i++) { /* Corruption offset */ + for (int j = jstart; j < jend; j++) { /* wal files */ + // Fill data for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + + // Corrupt the WAL + RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, + /*len%=*/.1, j, trunc); + + // Verify behavior + options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + + // Probe data for invariants + size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_LT(recovered_row_count, row_count); + + if (!trunc) { + ASSERT_TRUE(i != 0 || recovered_row_count > 0); + } + } + } + } +} + +TEST_F(DBWALTest, AvoidFlushDuringRecovery) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.avoid_flush_during_recovery = false; + + // Test with flush after recovery. + Reopen(options); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Put("bar", "v4")); + ASSERT_EQ(1, TotalTableFiles()); + // Reopen DB. Check if WAL logs flushed. + Reopen(options); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v4", Get("bar")); + ASSERT_EQ(2, TotalTableFiles()); + + // Test without flush after recovery. + options.avoid_flush_during_recovery = true; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v5")); + ASSERT_OK(Put("bar", "v6")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "v7")); + ASSERT_OK(Put("bar", "v8")); + ASSERT_EQ(1, TotalTableFiles()); + // Reopen DB. WAL logs should not be flushed this time. + Reopen(options); + ASSERT_EQ("v7", Get("foo")); + ASSERT_EQ("v8", Get("bar")); + ASSERT_EQ(1, TotalTableFiles()); + + // Force flush with allow_2pc. 
+ options.avoid_flush_during_recovery = true; + options.allow_2pc = true; + ASSERT_OK(Put("foo", "v9")); + ASSERT_OK(Put("bar", "v10")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "v11")); + ASSERT_OK(Put("bar", "v12")); + Reopen(options); + ASSERT_EQ("v11", Get("foo")); + ASSERT_EQ("v12", Get("bar")); + ASSERT_EQ(2, TotalTableFiles()); +} + +TEST_F(DBWALTest, RecoverWithoutFlush) { + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.create_if_missing = false; + options.disable_auto_compactions = true; + options.write_buffer_size = 64 * 1024 * 1024; + + size_t count = RecoveryTestHelper::FillData(this, &options); + auto validateData = [this, count]() { + for (size_t i = 0; i < count; i++) { + ASSERT_NE(Get("key" + ToString(i)), "NOT_FOUND"); + } + }; + Reopen(options); + validateData(); + // Insert some data without flush + ASSERT_OK(Put("foo", "foo_v1")); + ASSERT_OK(Put("bar", "bar_v1")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v1"); + ASSERT_EQ(Get("bar"), "bar_v1"); + // Insert again and reopen + ASSERT_OK(Put("foo", "foo_v2")); + ASSERT_OK(Put("bar", "bar_v2")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v2"); + ASSERT_EQ(Get("bar"), "bar_v2"); + // manual flush and insert again + Flush(); + ASSERT_EQ(Get("foo"), "foo_v2"); + ASSERT_EQ(Get("bar"), "bar_v2"); + ASSERT_OK(Put("foo", "foo_v3")); + ASSERT_OK(Put("bar", "bar_v3")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v3"); + ASSERT_EQ(Get("bar"), "bar_v3"); +} + +TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { + const std::string kSmallValue = "v"; + const std::string kLargeValue = DummyString(1024); + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.create_if_missing = false; + options.disable_auto_compactions = true; + + auto countWalFiles = [this]() { + VectorLogPtr log_files; + dbfull()->GetSortedWalFiles(log_files); + return log_files.size(); + 
}; + + // Create DB with multiple column families and multiple log files. + CreateAndReopenWithCF({"one", "two"}, options); + ASSERT_OK(Put(0, "key1", kSmallValue)); + ASSERT_OK(Put(1, "key2", kLargeValue)); + Flush(1); + ASSERT_EQ(1, countWalFiles()); + ASSERT_OK(Put(0, "key3", kSmallValue)); + ASSERT_OK(Put(2, "key4", kLargeValue)); + Flush(2); + ASSERT_EQ(2, countWalFiles()); + + // Reopen, insert and flush. + options.db_write_buffer_size = 64 * 1024 * 1024; + ReopenWithColumnFamilies({"default", "one", "two"}, options); + ASSERT_EQ(Get(0, "key1"), kSmallValue); + ASSERT_EQ(Get(1, "key2"), kLargeValue); + ASSERT_EQ(Get(0, "key3"), kSmallValue); + ASSERT_EQ(Get(2, "key4"), kLargeValue); + // Insert more data. + ASSERT_OK(Put(0, "key5", kLargeValue)); + ASSERT_OK(Put(1, "key6", kLargeValue)); + ASSERT_EQ(3, countWalFiles()); + Flush(1); + ASSERT_OK(Put(2, "key7", kLargeValue)); + ASSERT_EQ(4, countWalFiles()); + + // Reopen twice and validate. + for (int i = 0; i < 2; i++) { + ReopenWithColumnFamilies({"default", "one", "two"}, options); + ASSERT_EQ(Get(0, "key1"), kSmallValue); + ASSERT_EQ(Get(1, "key2"), kLargeValue); + ASSERT_EQ(Get(0, "key3"), kSmallValue); + ASSERT_EQ(Get(2, "key4"), kLargeValue); + ASSERT_EQ(Get(0, "key5"), kLargeValue); + ASSERT_EQ(Get(1, "key6"), kLargeValue); + ASSERT_EQ(Get(2, "key7"), kLargeValue); + ASSERT_EQ(4, countWalFiles()); + } +} + +// In this test we are trying to do the following: +// 1. Create a DB with corrupted WAL log; +// 2. Open with avoid_flush_during_recovery = true; +// 3. Append more data without flushing, which creates new WAL log. +// 4. Open again. See if it can correctly handle previous corruption. 
+TEST_F(DBWALTest, RecoverFromCorruptedWALWithoutFlush) { + const int jstart = RecoveryTestHelper::kWALFileOffset; + const int jend = jstart + RecoveryTestHelper::kWALFilesCount; + const int kAppendKeys = 100; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.create_if_missing = false; + options.disable_auto_compactions = true; + options.write_buffer_size = 64 * 1024 * 1024; + + auto getAll = [this]() { + std::vector> data; + ReadOptions ropt; + Iterator* iter = dbfull()->NewIterator(ropt); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + data.push_back( + std::make_pair(iter->key().ToString(), iter->value().ToString())); + } + delete iter; + return data; + }; + for (auto& mode : wal_recovery_mode_string_map) { + options.wal_recovery_mode = mode.second; + for (auto trunc : {true, false}) { + for (int i = 0; i < 4; i++) { + for (int j = jstart; j < jend; j++) { + // Create corrupted WAL + RecoveryTestHelper::FillData(this, &options); + RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, + /*len%=*/.1, /*wal=*/j, trunc); + // Skip the test if DB won't open. + if (!TryReopen(options).ok()) { + ASSERT_TRUE(options.wal_recovery_mode == + WALRecoveryMode::kAbsoluteConsistency || + (!trunc && + options.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords)); + continue; + } + ASSERT_OK(TryReopen(options)); + // Append some more data. + for (int k = 0; k < kAppendKeys; k++) { + std::string key = "extra_key" + ToString(k); + std::string value = DummyString(RecoveryTestHelper::kValueSize); + ASSERT_OK(Put(key, value)); + } + // Save data for comparison. + auto data = getAll(); + // Reopen. Verify data. 
+ ASSERT_OK(TryReopen(options)); + auto actual_data = getAll(); + ASSERT_EQ(data, actual_data); + } + } + } + } +} + +#endif // ROCKSDB_LITE + } // namespace rocksdb int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/external/rocksdb/db/dbformat.cc b/external/rocksdb/db/dbformat.cc index eb19a7b175..d840aea86b 100644 --- a/external/rocksdb/db/dbformat.cc +++ b/external/rocksdb/db/dbformat.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/dbformat.h b/external/rocksdb/db/dbformat.h index 2f5d59e606..5bd154a7a9 100644 --- a/external/rocksdb/db/dbformat.h +++ b/external/rocksdb/db/dbformat.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -39,6 +39,11 @@ enum ValueType : unsigned char { kTypeColumnFamilyMerge = 0x6, // WAL only. kTypeSingleDeletion = 0x7, kTypeColumnFamilySingleDeletion = 0x8, // WAL only. + kTypeBeginPrepareXID = 0x9, // WAL only. + kTypeEndPrepareXID = 0xA, // WAL only. + kTypeCommitXID = 0xB, // WAL only. + kTypeRollbackXID = 0xC, // WAL only. + kTypeNoop = 0xD, // WAL only. kMaxValue = 0x7F // Not used for storing records. 
}; @@ -174,6 +179,10 @@ class InternalKey { Slice user_key() const { return ExtractUserKey(rep_); } size_t size() { return rep_.size(); } + void Set(const Slice& _user_key, SequenceNumber s, ValueType t) { + SetFrom(ParsedInternalKey(_user_key, s, t)); + } + void SetFrom(const ParsedInternalKey& p) { rep_.clear(); AppendInternalKey(&rep_, p); @@ -271,7 +280,8 @@ inline LookupKey::~LookupKey() { class IterKey { public: - IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {} + IterKey() + : buf_(space_), buf_size_(sizeof(space_)), key_(buf_), key_size_(0) {} ~IterKey() { ResetBuffer(); } @@ -293,31 +303,41 @@ class IterKey { void TrimAppend(const size_t shared_len, const char* non_shared_data, const size_t non_shared_len) { assert(shared_len <= key_size_); - size_t total_size = shared_len + non_shared_len; - if (total_size <= buf_size_) { - key_size_ = total_size; - } else { + + if (IsKeyPinned() /* key is not in buf_ */) { + // Copy the key from external memory to buf_ (copy shared_len bytes) + EnlargeBufferIfNeeded(total_size); + memcpy(buf_, key_, shared_len); + } else if (total_size > buf_size_) { // Need to allocate space, delete previous space char* p = new char[total_size]; memcpy(p, key_, shared_len); - if (key_ != space_) { - delete[] key_; + if (buf_ != space_) { + delete[] buf_; } - key_ = p; - key_size_ = total_size; + buf_ = p; buf_size_ = total_size; } - memcpy(key_ + shared_len, non_shared_data, non_shared_len); + memcpy(buf_ + shared_len, non_shared_data, non_shared_len); + key_ = buf_; + key_size_ = total_size; } - Slice SetKey(const Slice& key) { + Slice SetKey(const Slice& key, bool copy = true) { size_t size = key.size(); - EnlargeBufferIfNeeded(size); - memcpy(key_, key.data(), size); + if (copy) { + // Copy key to buf_ + EnlargeBufferIfNeeded(size); + memcpy(buf_, key.data(), size); + key_ = buf_; + } else { + // Update key_ to point to external memory + key_ = key.data(); + } key_size_ = size; return Slice(key_, key_size_); } @@ 
-335,11 +355,14 @@ class IterKey { // Update the sequence number in the internal key. Guarantees not to // invalidate slices to the key (and the user key). void UpdateInternalKey(uint64_t seq, ValueType t) { + assert(!IsKeyPinned()); assert(key_size_ >= 8); uint64_t newval = (seq << 8) | t; - EncodeFixed64(&key_[key_size_ - 8], newval); + EncodeFixed64(&buf_[key_size_ - 8], newval); } + bool IsKeyPinned() const { return (key_ != buf_); } + void SetInternalKey(const Slice& key_prefix, const Slice& user_key, SequenceNumber s, ValueType value_type = kValueTypeForSeek) { @@ -347,10 +370,12 @@ class IterKey { size_t usize = user_key.size(); EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t)); if (psize > 0) { - memcpy(key_, key_prefix.data(), psize); + memcpy(buf_, key_prefix.data(), psize); } - memcpy(key_ + psize, user_key.data(), usize); - EncodeFixed64(key_ + usize + psize, PackSequenceAndType(s, value_type)); + memcpy(buf_ + psize, user_key.data(), usize); + EncodeFixed64(buf_ + usize + psize, PackSequenceAndType(s, value_type)); + + key_ = buf_; key_size_ = psize + usize + sizeof(uint64_t); } @@ -377,20 +402,22 @@ class IterKey { void EncodeLengthPrefixedKey(const Slice& key) { auto size = key.size(); EnlargeBufferIfNeeded(size + static_cast(VarintLength(size))); - char* ptr = EncodeVarint32(key_, static_cast(size)); + char* ptr = EncodeVarint32(buf_, static_cast(size)); memcpy(ptr, key.data(), size); + key_ = buf_; } private: - char* key_; + char* buf_; size_t buf_size_; + const char* key_; size_t key_size_; char space_[32]; // Avoid allocation for short keys void ResetBuffer() { - if (key_ != space_) { - delete[] key_; - key_ = space_; + if (buf_ != space_) { + delete[] buf_; + buf_ = space_; } buf_size_ = sizeof(space_); key_size_ = 0; @@ -407,7 +434,7 @@ class IterKey { if (key_size > buf_size_) { // Need to enlarge the buffer. 
ResetBuffer(); - key_ = new char[key_size]; + buf_ = new char[key_size]; buf_size_ = key_size; } } @@ -447,6 +474,12 @@ class InternalKeySliceTransform : public SliceTransform { const SliceTransform* const transform_; }; +// Read the key of a record from a write batch. +// if this record represent the default column family then cf_record +// must be passed as false, otherwise it must be passed as true. +extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, + bool cf_record); + // Read record from a write batch piece from input. // tag, column_family, key, value and blob are return values. Callers own the // Slice they point to. @@ -454,5 +487,5 @@ class InternalKeySliceTransform : public SliceTransform { // input will be advanced to after the record. extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, uint32_t* column_family, Slice* key, - Slice* value, Slice* blob); + Slice* value, Slice* blob, Slice* xid); } // namespace rocksdb diff --git a/external/rocksdb/db/dbformat_test.cc b/external/rocksdb/db/dbformat_test.cc index 0273dd0625..b431690cc9 100644 --- a/external/rocksdb/db/dbformat_test.cc +++ b/external/rocksdb/db/dbformat_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -92,6 +92,30 @@ TEST_F(FormatTest, InternalKeyShortSeparator) { Shorten(IKey("foo", 100, kTypeValue), IKey("hello", 200, kTypeValue))); + ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("ABC1AAAAA", 100, kTypeValue), + IKey("ABC2ABB", 200, kTypeValue))); + + ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), + IKey("AAA2AA", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue))); + + ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), + IKey("AAA2A", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA1", 100, kTypeValue), + Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue))); + // When start user key is prefix of limit user key ASSERT_EQ(IKey("foo", 100, kTypeValue), Shorten(IKey("foo", 100, kTypeValue), diff --git a/external/rocksdb/db/deletefile_test.cc b/external/rocksdb/db/deletefile_test.cc index b4ddad5e23..c8257ccd81 100644 --- a/external/rocksdb/db/deletefile_test.cc +++ b/external/rocksdb/db/deletefile_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -9,20 +9,21 @@ #ifndef ROCKSDB_LITE -#include "rocksdb/db.h" +#include +#include +#include +#include #include "db/db_impl.h" #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/transaction_log.h" #include "util/string_util.h" +#include "util/sync_point.h" #include "util/testharness.h" #include "util/testutil.h" -#include "rocksdb/env.h" -#include "rocksdb/transaction_log.h" -#include -#include -#include -#include namespace rocksdb { @@ -37,6 +38,7 @@ class DeleteFileTest : public testing::Test { DeleteFileTest() { db_ = nullptr; env_ = Env::Default(); + options_.delete_obsolete_files_period_micros = 0; // always do full purge options_.enable_thread_tracking = true; options_.write_buffer_size = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000; @@ -74,6 +76,7 @@ class DeleteFileTest : public testing::Test { void CloseDB() { delete db_; + db_ = nullptr; } void AddKeys(int numkeys, int startkey = 0) { @@ -150,6 +153,15 @@ class DeleteFileTest : public testing::Test { ASSERT_EQ(required_manifest, manifest_cnt); } + static void DoSleep(void* arg) { + auto test = reinterpret_cast(arg); + test->env_->SleepForMicroseconds(2 * 1000 * 1000); + } + + // An empty job to guard all jobs are processed + static void GuardFinish(void* arg) { + TEST_SYNC_POINT("DeleteFileTest::GuardFinish"); + } }; TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { @@ -229,6 +241,118 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { CloseDB(); } +TEST_F(DeleteFileTest, BackgroundPurgeTest) { + std::string first("0"), last("999999"); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + Slice first_slice(first), last_slice(last); + + // We keep an iterator alive + Iterator* itr = 0; + CreateTwoLevels(); + ReadOptions options; + options.background_purge_on_iterator_cleanup = true; + itr = db_->NewIterator(options); + 
db_->CompactRange(compact_options, &first_slice, &last_slice); + // 3 sst after compaction with live iterator + CheckFileTypeCounts(dbname_, 0, 3, 1); + test::SleepingBackgroundTask sleeping_task_before; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_before, Env::Priority::HIGH); + delete itr; + test::SleepingBackgroundTask sleeping_task_after; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_after, Env::Priority::HIGH); + + // Make sure no purges are executed foreground + CheckFileTypeCounts(dbname_, 0, 3, 1); + sleeping_task_before.WakeUp(); + sleeping_task_before.WaitUntilDone(); + + // Make sure all background purges are executed + sleeping_task_after.WakeUp(); + sleeping_task_after.WaitUntilDone(); + // 1 sst after iterator deletion + CheckFileTypeCounts(dbname_, 0, 1, 1); + + CloseDB(); +} + +// This test is to reproduce a bug that read invalid ReadOption in iterator +// cleanup function +TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { + std::string first("0"), last("999999"); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + Slice first_slice(first), last_slice(last); + + // We keep an iterator alive + Iterator* itr = 0; + CreateTwoLevels(); + ReadOptions* options = new ReadOptions(); + options->background_purge_on_iterator_cleanup = true; + itr = db_->NewIterator(*options); + // ReadOptions is deleted, but iterator cleanup function should not be + // affected + delete options; + + db_->CompactRange(compact_options, &first_slice, &last_slice); + // 3 sst after compaction with live iterator + CheckFileTypeCounts(dbname_, 0, 3, 1); + delete itr; + + test::SleepingBackgroundTask sleeping_task_after; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_after, Env::Priority::HIGH); + + // Make sure all background purges are executed + sleeping_task_after.WakeUp(); + sleeping_task_after.WaitUntilDone(); + // 1 
sst after iterator deletion + CheckFileTypeCounts(dbname_, 0, 1, 1); + + CloseDB(); +} + +TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { + std::string first("0"), last("999999"); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + Slice first_slice(first), last_slice(last); + + // We keep an iterator alive + CreateTwoLevels(); + ReadOptions options; + options.background_purge_on_iterator_cleanup = true; + Iterator* itr1 = db_->NewIterator(options); + CreateTwoLevels(); + Iterator* itr2 = db_->NewIterator(options); + db_->CompactRange(compact_options, &first_slice, &last_slice); + // 5 sst files after 2 compactions with 2 live iterators + CheckFileTypeCounts(dbname_, 0, 5, 1); + + // ~DBImpl should wait until all BGWorkPurge are finished + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"}, + {"DeleteFileTest::GuardFinish", + "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + delete itr1; + env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH); + delete itr2; + env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH); + CloseDB(); + + TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"); + // 1 sst after iterator deletion + CheckFileTypeCounts(dbname_, 0, 1, 1); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DeleteFileTest, DeleteFileWithIterator) { CreateTwoLevels(); ReadOptions options; diff --git a/external/rocksdb/db/event_helpers.cc b/external/rocksdb/db/event_helpers.cc index 9035c0c4ba..9249837c2b 100644 --- a/external/rocksdb/db/event_helpers.cc +++ b/external/rocksdb/db/event_helpers.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,7 +8,10 @@ namespace rocksdb { namespace { -inline double SafeDivide(double a, double b) { return b == 0.0 ? 0 : a / b; } +template +inline T SafeDivide(T a, T b) { + return b == 0 ? 0 : a / b; +} } // namespace void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) { @@ -17,57 +20,83 @@ void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) { std::chrono::system_clock::now().time_since_epoch()).count(); } -void EventHelpers::LogAndNotifyTableFileCreation( +#ifndef ROCKSDB_LITE +void EventHelpers::NotifyTableFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, TableFileCreationReason reason) { + TableFileCreationBriefInfo info; + info.db_name = db_name; + info.cf_name = cf_name; + info.file_path = file_path; + info.job_id = job_id; + info.reason = reason; + for (auto& listener : listeners) { + listener->OnTableFileCreationStarted(info); + } +} +#endif // !ROCKSDB_LITE + +void EventHelpers::LogAndNotifyTableFileCreationFinished( EventLogger* event_logger, const std::vector>& listeners, - const FileDescriptor& fd, const TableFileCreationInfo& info) { - assert(event_logger); - JSONWriter jwriter; - AppendCurrentTime(&jwriter); - jwriter << "cf_name" << info.cf_name - << "job" << info.job_id - << "event" << "table_file_creation" - << "file_number" << fd.GetNumber() - << "file_size" << fd.GetFileSize(); - - // table_properties - { - jwriter << "table_properties"; - jwriter.StartObject(); - - // basic properties: - jwriter << "data_size" << info.table_properties.data_size - << "index_size" << info.table_properties.index_size - << "filter_size" << info.table_properties.filter_size - << "raw_key_size" << 
info.table_properties.raw_key_size - << "raw_average_key_size" << SafeDivide( - info.table_properties.raw_key_size, - info.table_properties.num_entries) - << "raw_value_size" << info.table_properties.raw_value_size - << "raw_average_value_size" << SafeDivide( - info.table_properties.raw_value_size, - info.table_properties.num_entries) - << "num_data_blocks" << info.table_properties.num_data_blocks - << "num_entries" << info.table_properties.num_entries - << "filter_policy_name" << - info.table_properties.filter_policy_name; - - // user collected properties - for (const auto& prop : info.table_properties.user_collected_properties) { - jwriter << prop.first << prop.second; + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, const FileDescriptor& fd, + const TableProperties& table_properties, TableFileCreationReason reason, + const Status& s) { + if (s.ok() && event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + jwriter << "cf_name" << cf_name << "job" << job_id << "event" + << "table_file_creation" + << "file_number" << fd.GetNumber() << "file_size" + << fd.GetFileSize(); + + // table_properties + { + jwriter << "table_properties"; + jwriter.StartObject(); + + // basic properties: + jwriter << "data_size" << table_properties.data_size << "index_size" + << table_properties.index_size << "filter_size" + << table_properties.filter_size << "raw_key_size" + << table_properties.raw_key_size << "raw_average_key_size" + << SafeDivide(table_properties.raw_key_size, + table_properties.num_entries) + << "raw_value_size" << table_properties.raw_value_size + << "raw_average_value_size" + << SafeDivide(table_properties.raw_value_size, + table_properties.num_entries) + << "num_data_blocks" << table_properties.num_data_blocks + << "num_entries" << table_properties.num_entries + << "filter_policy_name" << table_properties.filter_policy_name; + + // user collected properties + for (const auto& prop : 
table_properties.readable_properties) { + jwriter << prop.first << prop.second; + } + jwriter.EndObject(); } jwriter.EndObject(); - } - jwriter.EndObject(); - event_logger->Log(jwriter); + event_logger->Log(jwriter); + } #ifndef ROCKSDB_LITE if (listeners.size() == 0) { return; } - - for (auto listener : listeners) { + TableFileCreationInfo info; + info.db_name = db_name; + info.cf_name = cf_name; + info.file_path = file_path; + info.file_size = fd.file_size; + info.job_id = job_id; + info.table_properties = table_properties; + info.reason = reason; + info.status = s; + for (auto& listener : listeners) { listener->OnTableFileCreated(info); } #endif // !ROCKSDB_LITE diff --git a/external/rocksdb/db/event_helpers.h b/external/rocksdb/db/event_helpers.h index a60bc9a9e8..e9c111f20f 100644 --- a/external/rocksdb/db/event_helpers.h +++ b/external/rocksdb/db/event_helpers.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -19,15 +19,30 @@ namespace rocksdb { class EventHelpers { public: static void AppendCurrentTime(JSONWriter* json_writer); - static void LogAndNotifyTableFileCreation( +#ifndef ROCKSDB_LITE + static void NotifyTableFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, TableFileCreationReason reason); +#endif // !ROCKSDB_LITE + static void LogAndNotifyTableFileCreationFinished( EventLogger* event_logger, const std::vector>& listeners, - const FileDescriptor& fd, const TableFileCreationInfo& info); + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, const FileDescriptor& fd, + const TableProperties& table_properties, TableFileCreationReason reason, + const Status& s); static void LogAndNotifyTableFileDeletion( EventLogger* event_logger, int job_id, uint64_t file_number, const std::string& file_path, const Status& status, const std::string& db_name, const std::vector>& listeners); + + private: + static void LogAndNotifyTableFileCreation( + EventLogger* event_logger, + const std::vector>& listeners, + const FileDescriptor& fd, const TableFileCreationInfo& info); }; } // namespace rocksdb diff --git a/external/rocksdb/db/experimental.cc b/external/rocksdb/db/experimental.cc index 0b5018aef1..26b2113d2a 100644 --- a/external/rocksdb/db/experimental.cc +++ b/external/rocksdb/db/experimental.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/db/fault_injection_test.cc b/external/rocksdb/db/fault_injection_test.cc index 84a6e9a524..0883de77b1 100644 --- a/external/rocksdb/db/fault_injection_test.cc +++ b/external/rocksdb/db/fault_injection_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,10 +11,6 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". -#if !(defined NDEBUG) || !defined(OS_WIN) - -#include -#include #include "db/db_impl.h" #include "db/filename.h" #include "db/log_format.h" @@ -24,6 +20,7 @@ #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" +#include "util/fault_injection_test_env.h" #include "util/logging.h" #include "util/mock_env.h" #include "util/mutexlock.h" @@ -37,401 +34,6 @@ static const int kValueSize = 1000; static const int kMaxNumValues = 2000; static const size_t kNumIterations = 3; -class TestWritableFile; -class FaultInjectionTestEnv; - -namespace { - -// Assume a filename, and not a directory name like "/foo/bar/" -static std::string GetDirName(const std::string filename) { - size_t found = filename.find_last_of("/\\"); - if (found == std::string::npos) { - return ""; - } else { - return filename.substr(0, found); - } -} - -// Trim the tailing "/" in the end of `str` -static std::string TrimDirname(const std::string& str) { - size_t found = str.find_last_not_of("/"); - if (found == std::string::npos) { - return str; - } - return str.substr(0, found + 1); -} - -// Return pair of a full path. 
-static std::pair GetDirAndName( - const std::string& name) { - std::string dirname = GetDirName(name); - std::string fname = name.substr(dirname.size() + 1); - return std::make_pair(dirname, fname); -} - -// A basic file truncation function suitable for this test. -Status Truncate(Env* env, const std::string& filename, uint64_t length) { - unique_ptr orig_file; - const EnvOptions options; - Status s = env->NewSequentialFile(filename, &orig_file, options); - if (!s.ok()) { - fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), - s.ToString().c_str()); - return s; - } - - std::unique_ptr scratch(new char[length]); - rocksdb::Slice result; - s = orig_file->Read(length, &result, scratch.get()); -#ifdef OS_WIN - orig_file.reset(); -#endif - if (s.ok()) { - std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; - unique_ptr tmp_file; - s = env->NewWritableFile(tmp_name, &tmp_file, options); - if (s.ok()) { - s = tmp_file->Append(result); - if (s.ok()) { - s = env->RenameFile(tmp_name, filename); - } else { - fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(), - filename.c_str(), s.ToString().c_str()); - env->DeleteFile(tmp_name); - } - } - } - if (!s.ok()) { - fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), - s.ToString().c_str()); - } - - return s; -} - -struct FileState { - std::string filename_; - ssize_t pos_; - ssize_t pos_at_last_sync_; - ssize_t pos_at_last_flush_; - - explicit FileState(const std::string& filename) - : filename_(filename), - pos_(-1), - pos_at_last_sync_(-1), - pos_at_last_flush_(-1) { } - - FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} - - bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } - - Status DropUnsyncedData(Env* env) const; - - Status DropRandomUnsyncedData(Env* env, Random* rand) const; -}; - -} // anonymous namespace - -// A wrapper around WritableFileWriter* file -// is written to or sync'ed. 
-class TestWritableFile : public WritableFile { - public: - explicit TestWritableFile(const std::string& fname, - unique_ptr&& f, - FaultInjectionTestEnv* env); - virtual ~TestWritableFile(); - virtual Status Append(const Slice& data) override; - virtual Status Truncate(uint64_t size) override { return target_->Truncate(size); } - virtual Status Close() override; - virtual Status Flush() override; - virtual Status Sync() override; - virtual bool IsSyncThreadSafe() const override { return true; } - - private: - FileState state_; - unique_ptr target_; - bool writable_file_opened_; - FaultInjectionTestEnv* env_; -}; - -class TestDirectory : public Directory { - public: - explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, - Directory* dir) - : env_(env), dirname_(dirname), dir_(dir) {} - ~TestDirectory() {} - - virtual Status Fsync() override; - - private: - FaultInjectionTestEnv* env_; - std::string dirname_; - unique_ptr dir_; -}; - -class FaultInjectionTestEnv : public EnvWrapper { - public: - explicit FaultInjectionTestEnv(Env* base) - : EnvWrapper(base), - filesystem_active_(true) {} - virtual ~FaultInjectionTestEnv() { } - - Status NewDirectory(const std::string& name, - unique_ptr* result) override { - unique_ptr r; - Status s = target()->NewDirectory(name, &r); - EXPECT_OK(s); - if (!s.ok()) { - return s; - } - result->reset(new TestDirectory(this, TrimDirname(name), r.release())); - return Status::OK(); - } - - Status NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) override { - if (!IsFilesystemActive()) { - return Status::Corruption("Not Active"); - } - // Not allow overwriting files - Status s = target()->FileExists(fname); - if (s.ok()) { - return Status::Corruption("File already exists."); - } else if (!s.IsNotFound()) { - assert(s.IsIOError()); - return s; - } - s = target()->NewWritableFile(fname, result, soptions); - if (s.ok()) { - result->reset(new TestWritableFile(fname, 
std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. - UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); - } - return s; - } - - virtual Status DeleteFile(const std::string& f) override { - if (!IsFilesystemActive()) { - return Status::Corruption("Not Active"); - } - Status s = EnvWrapper::DeleteFile(f); - if (!s.ok()) { - fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), - s.ToString().c_str()); - } - EXPECT_OK(s); - if (s.ok()) { - UntrackFile(f); - } - return s; - } - - virtual Status RenameFile(const std::string& s, - const std::string& t) override { - if (!IsFilesystemActive()) { - return Status::Corruption("Not Active"); - } - Status ret = EnvWrapper::RenameFile(s, t); - - if (ret.ok()) { - MutexLock l(&mutex_); - if (db_file_state_.find(s) != db_file_state_.end()) { - db_file_state_[t] = db_file_state_[s]; - db_file_state_.erase(s); - } - - auto sdn = GetDirAndName(s); - auto tdn = GetDirAndName(t); - if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { - auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; - assert(tlist.find(tdn.second) == tlist.end()); - tlist.insert(tdn.second); - } - } - - return ret; - } - - void WritableFileClosed(const FileState& state) { - MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { - db_file_state_[state.filename_] = state; - open_files_.erase(state.filename_); - } - } - - // For every file that is not fully synced, make a call to `func` with - // FileState of the file as the parameter. 
- Status DropFileData(std::function func) { - Status s; - MutexLock l(&mutex_); - for (std::map::const_iterator it = - db_file_state_.begin(); - s.ok() && it != db_file_state_.end(); ++it) { - const FileState& state = it->second; - if (!state.IsFullySynced()) { - s = func(target(), state); - } - } - return s; - } - - Status DropUnsyncedFileData() { - return DropFileData([&](Env* env, const FileState& state) { - return state.DropUnsyncedData(env); - }); - } - - Status DropRandomUnsyncedFileData(Random* rnd) { - return DropFileData([&](Env* env, const FileState& state) { - return state.DropRandomUnsyncedData(env, rnd); - }); - } - - Status DeleteFilesCreatedAfterLastDirSync() { - // Because DeleteFile access this container make a copy to avoid deadlock - std::map> map_copy; - { - MutexLock l(&mutex_); - map_copy.insert(dir_to_new_files_since_last_sync_.begin(), - dir_to_new_files_since_last_sync_.end()); - } - - for (auto& pair : map_copy) { - for (std::string name : pair.second) { - Status s = DeleteFile(pair.first + "/" + name); - if (!s.ok()) { - return s; - } - } - } - return Status::OK(); - } - void ResetState() { - MutexLock l(&mutex_); - db_file_state_.clear(); - dir_to_new_files_since_last_sync_.clear(); - SetFilesystemActiveNoLock(true); - } - - void UntrackFile(const std::string& f) { - MutexLock l(&mutex_); - auto dir_and_name = GetDirAndName(f); - dir_to_new_files_since_last_sync_[dir_and_name.first].erase( - dir_and_name.second); - db_file_state_.erase(f); - open_files_.erase(f); - } - - void SyncDir(const std::string& dirname) { - MutexLock l(&mutex_); - dir_to_new_files_since_last_sync_.erase(dirname); - } - - // Setting the filesystem to inactive is the test equivalent to simulating a - // system reset. Setting to inactive will freeze our saved filesystem state so - // that it will stop being recorded. It can then be reset back to the state at - // the time of the reset. 
- bool IsFilesystemActive() { - MutexLock l(&mutex_); - return filesystem_active_; - } - void SetFilesystemActiveNoLock(bool active) { filesystem_active_ = active; } - void SetFilesystemActive(bool active) { - MutexLock l(&mutex_); - SetFilesystemActiveNoLock(active); - } - void AssertNoOpenFile() { ASSERT_TRUE(open_files_.empty()); } - - private: - port::Mutex mutex_; - std::map db_file_state_; - std::set open_files_; - std::unordered_map> - dir_to_new_files_since_last_sync_; - bool filesystem_active_; // Record flushes, syncs, writes -}; - -Status FileState::DropUnsyncedData(Env* env) const { - ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; - return Truncate(env, filename_, sync_pos); -} - -Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { - ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; - assert(pos_ >= sync_pos); - int range = static_cast(pos_ - sync_pos); - uint64_t truncated_size = - static_cast(sync_pos) + rand->Uniform(range); - return Truncate(env, filename_, truncated_size); -} - -Status TestDirectory::Fsync() { - env_->SyncDir(dirname_); - return dir_->Fsync(); -} - -TestWritableFile::TestWritableFile(const std::string& fname, - unique_ptr&& f, - FaultInjectionTestEnv* env) - : state_(fname), - target_(std::move(f)), - writable_file_opened_(true), - env_(env) { - assert(target_ != nullptr); - state_.pos_ = 0; -} - -TestWritableFile::~TestWritableFile() { - if (writable_file_opened_) { - Close(); - } -} - -Status TestWritableFile::Append(const Slice& data) { - if (!env_->IsFilesystemActive()) { - return Status::Corruption("Not Active"); - } - Status s = target_->Append(data); - if (s.ok()) { - state_.pos_ += data.size(); - } - return s; -} - -Status TestWritableFile::Close() { - writable_file_opened_ = false; - Status s = target_->Close(); - if (s.ok()) { - env_->WritableFileClosed(state_); - } - return s; -} - -Status TestWritableFile::Flush() { - Status s = target_->Flush(); - if (s.ok() 
&& env_->IsFilesystemActive()) { - state_.pos_at_last_flush_ = state_.pos_; - } - return s; -} - -Status TestWritableFile::Sync() { - if (!env_->IsFilesystemActive()) { - return Status::OK(); - } - // No need to actual sync. - state_.pos_at_last_sync_ = state_.pos_; - return Status::OK(); -} - class FaultInjectionTest : public testing::Test, public testing::WithParamInterface { protected: @@ -647,16 +249,15 @@ class FaultInjectionTest : public testing::Test, return test::RandomString(&r, kValueSize, storage); } - Status OpenDB() { + void CloseDB() { delete db_; db_ = NULL; - env_->ResetState(); - return DB::Open(options_, dbname_, &db_); } - void CloseDB() { - delete db_; - db_ = NULL; + Status OpenDB() { + CloseDB(); + env_->ResetState(); + return DB::Open(options_, dbname_, &db_); } void DeleteAllData() { @@ -735,7 +336,7 @@ class FaultInjectionTest : public testing::Test, } void WaitCompactionFinish() { - static_cast(db_)->TEST_WaitForCompact(); + static_cast(db_->GetRootDB())->TEST_WaitForCompact(); ASSERT_OK(db_->Put(WriteOptions(), "", "")); } }; @@ -786,6 +387,7 @@ TEST_P(FaultInjectionTest, WriteOptionSyncTest) { // Block the job queue to prevent flush job from running. env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::HIGH); + sleeping_task_low.WaitUntilSleeping(); WriteOptions write_options; write_options.sync = false; @@ -869,6 +471,7 @@ TEST_P(FaultInjectionTest, ManualLogSyncTest) { // Block the job queue to prevent flush job from running. 
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::HIGH); + sleeping_task_low.WaitUntilSleeping(); WriteOptions write_options; write_options.sync = false; @@ -902,13 +505,7 @@ INSTANTIATE_TEST_CASE_P(FaultTest, FaultInjectionTest, ::testing::Bool()); } // namespace rocksdb -#endif // #if !(defined NDEBUG) || !defined(OS_WIN) - int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/external/rocksdb/db/file_indexer.cc b/external/rocksdb/db/file_indexer.cc index 222cca9c03..9b31c2bd65 100644 --- a/external/rocksdb/db/file_indexer.cc +++ b/external/rocksdb/db/file_indexer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/file_indexer.h b/external/rocksdb/db/file_indexer.h index 3a335bec5c..5eb10bc4dc 100644 --- a/external/rocksdb/db/file_indexer.h +++ b/external/rocksdb/db/file_indexer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -26,9 +26,9 @@ struct FileLevel; // The file tree structure in Version is prebuilt and the range of each file // is known. 
On Version::Get(), it uses binary search to find a potential file // and then check if a target key can be found in the file by comparing the key -// to each file's smallest and largest key. The results of these comparisions +// to each file's smallest and largest key. The results of these comparisons // can be reused beyond checking if a key falls into a file's range. -// With some pre-calculated knowledge, each key comparision that has been done +// With some pre-calculated knowledge, each key comparison that has been done // can serve as a hint to narrow down further searches: if a key compared to // be smaller than a file's smallest or largest, that comparison can be used // to find out the right bound of next binary search. Similarly, if a key @@ -48,7 +48,7 @@ class FileIndexer { size_t LevelIndexSize(size_t level) const; // Return a file index range in the next level to search for a key based on - // smallest and largest key comparision for the current file specified by + // smallest and largest key comparison for the current file specified by // level and file_index. When *left_index < *right_index, both index should // be valid and fit in the vector size. void GetNextLevelIndex(const size_t level, const size_t file_index, diff --git a/external/rocksdb/db/file_indexer_test.cc b/external/rocksdb/db/file_indexer_test.cc index 98fea47feb..9b3cdd4d6d 100644 --- a/external/rocksdb/db/file_indexer_test.cc +++ b/external/rocksdb/db/file_indexer_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/db/filename.cc b/external/rocksdb/db/filename.cc index e152037332..d1f0958bbc 100644 --- a/external/rocksdb/db/filename.cc +++ b/external/rocksdb/db/filename.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,6 +21,8 @@ #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/sync_point.h" namespace rocksdb { @@ -46,8 +48,9 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { path[i] == '_'){ dest[write_idx++] = path[i]; } else { - if (i > 0) + if (i > 0) { dest[write_idx++] = '_'; + } } i++; } @@ -145,7 +148,7 @@ std::string LockFileName(const std::string& dbname) { } std::string TempFileName(const std::string& dbname, uint64_t number) { - return MakeFileName(dbname, number, "dbtmp"); + return MakeFileName(dbname, number, kTempFileNameSuffix.c_str()); } InfoLogPrefix::InfoLogPrefix(bool has_log_dir, @@ -185,6 +188,21 @@ std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, return log_dir + "/" + info_log_prefix.buf + ".old." 
+ buf; } +std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64, + kOptionsFileNamePrefix.c_str(), file_num); + return dbname + "/" + buffer; +} + +std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s", + kOptionsFileNamePrefix.c_str(), file_num, + kTempFileNameSuffix.c_str()); + return dbname + "/" + buffer; +} + std::string MetaDatabaseName(const std::string& dbname, uint64_t number) { char buf[100]; snprintf(buf, sizeof(buf), "/METADB-%llu", @@ -205,6 +223,8 @@ std::string IdentityFileName(const std::string& dbname) { // dbname/MANIFEST-[0-9]+ // dbname/[0-9]+.(log|sst) // dbname/METADB-[0-9]+ +// dbname/OPTIONS-[0-9]+ +// dbname/OPTIONS-[0-9]+.dbtmp // Disregards / at the beginning bool ParseFileName(const std::string& fname, uint64_t* number, @@ -267,6 +287,21 @@ bool ParseFileName(const std::string& fname, uint64_t* number, } *type = kMetaDatabase; *number = num; + } else if (rest.starts_with(kOptionsFileNamePrefix)) { + uint64_t ts_suffix; + bool is_temp_file = false; + rest.remove_prefix(kOptionsFileNamePrefix.size()); + const std::string kTempFileNameSuffixWithDot = + std::string(".") + kTempFileNameSuffix; + if (rest.ends_with(kTempFileNameSuffixWithDot)) { + rest.remove_suffix(kTempFileNameSuffixWithDot.size()); + is_temp_file = true; + } + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = is_temp_file ? 
kTempFile : kOptionsFile; } else { // Avoid strtoull() to keep filename format independent of the // current locale @@ -301,7 +336,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number, } else if (suffix == Slice(kRocksDbTFileExt) || suffix == Slice(kLevelDbTFileExt)) { *type = kTableFile; - } else if (suffix == Slice("dbtmp")) { + } else if (suffix == Slice(kTempFileNameSuffix)) { *type = kTempFile; } else { return false; @@ -322,7 +357,9 @@ Status SetCurrentFile(Env* env, const std::string& dbname, std::string tmp = TempFileName(dbname, descriptor_number); Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true); if (s.ok()) { + TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2); s = env->RenameFile(tmp, CurrentFileName(dbname)); + TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2); } if (s.ok()) { if (directory_to_fsync != nullptr) { @@ -351,6 +388,7 @@ Status SetIdentityFile(Env* env, const std::string& dbname) { Status SyncManifest(Env* env, const DBOptions* db_options, WritableFileWriter* file) { + TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2); if (db_options->disableDataSync) { return Status::OK(); } else { diff --git a/external/rocksdb/db/filename.h b/external/rocksdb/db/filename.h index 926f027de9..9a0a1eee33 100644 --- a/external/rocksdb/db/filename.h +++ b/external/rocksdb/db/filename.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -36,7 +36,8 @@ enum FileType { kTempFile, kInfoLogFile, // Either the current one, or an old one kMetaDatabase, - kIdentityFile + kIdentityFile, + kOptionsFile }; // Return the name of the log file with the specified number @@ -114,6 +115,19 @@ extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, const std::string& db_path = "", const std::string& log_dir = ""); +static const std::string kOptionsFileNamePrefix = "OPTIONS-"; +static const std::string kTempFileNameSuffix = "dbtmp"; + +// Return a options file name given the "dbname" and file number. +// Format: OPTIONS-[number].dbtmp +extern std::string OptionsFileName(const std::string& dbname, + uint64_t file_num); + +// Return a temp options file name given the "dbname" and file number. +// Format: OPTIONS-[number] +extern std::string TempOptionsFileName(const std::string& dbname, + uint64_t file_num); + // Return the name to use for a metadatabase. The result will be prefixed with // "dbname". extern std::string MetaDatabaseName(const std::string& dbname, diff --git a/external/rocksdb/db/filename_test.cc b/external/rocksdb/db/filename_test.cc index 2eafd52306..0f8e37e7fd 100644 --- a/external/rocksdb/db/filename_test.cc +++ b/external/rocksdb/db/filename_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/flush_job.cc b/external/rocksdb/db/flush_job.cc index 410108a051..bb38f485dd 100644 --- a/external/rocksdb/db/flush_job.cc +++ b/external/rocksdb/db/flush_job.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -62,10 +62,11 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, JobContext* job_context, LogBuffer* log_buffer, Directory* db_directory, Directory* output_file_directory, CompressionType output_compression, Statistics* stats, - EventLogger* event_logger) + EventLogger* event_logger, bool measure_io_stats) : dbname_(dbname), cfd_(cfd), db_options_(db_options), @@ -75,13 +76,16 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, db_mutex_(db_mutex), shutting_down_(shutting_down), existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), job_context_(job_context), log_buffer_(log_buffer), db_directory_(db_directory), output_file_directory_(output_file_directory), output_compression_(output_compression), stats_(stats), - event_logger_(event_logger) { + event_logger_(event_logger), + measure_io_stats_(measure_io_stats), + pick_memtable_called(false) { // Update the thread status to indicate flush. 
ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); @@ -92,7 +96,8 @@ FlushJob::~FlushJob() { } void FlushJob::ReportStartedFlush() { - ThreadStatusUtil::SetColumnFamily(cfd_); + ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, + cfd_->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); ThreadStatusUtil::SetThreadOperationProperty( ThreadStatus::COMPACTION_JOB_ID, @@ -111,38 +116,70 @@ void FlushJob::ReportFlushInputSize(const autovector& mems) { } void FlushJob::RecordFlushIOStats() { - ThreadStatusUtil::SetThreadOperationProperty( + RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); + ThreadStatusUtil::IncreaseThreadOperationProperty( ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); } -Status FlushJob::Run(FileMetaData* file_meta) { - AutoThreadOperationStageUpdater stage_run( - ThreadStatus::STAGE_FLUSH_RUN); +void FlushJob::PickMemTable() { + db_mutex_->AssertHeld(); + assert(!pick_memtable_called); + pick_memtable_called = true; // Save the contents of the earliest memtable as a new Table - FileMetaData meta; - autovector mems; - cfd_->imm()->PickMemtablesToFlush(&mems); - if (mems.empty()) { - LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush", - cfd_->GetName().c_str()); - return Status::OK(); + cfd_->imm()->PickMemtablesToFlush(&mems_); + if (mems_.empty()) { + return; } - ReportFlushInputSize(mems); + ReportFlushInputSize(mems_); // entries mems are (implicitly) sorted in ascending order by their created // time. We will use the first memtable's `edit` to keep the meta info for // this flush. - MemTable* m = mems[0]; - VersionEdit* edit = m->GetEdits(); - edit->SetPrevLogNumber(0); + MemTable* m = mems_[0]; + edit_ = m->GetEdits(); + edit_->SetPrevLogNumber(0); // SetLogNumber(log_num) indicates logs with number smaller than log_num // will no longer be picked up for recovery. 
- edit->SetLogNumber(mems.back()->GetNextLogNumber()); - edit->SetColumnFamily(cfd_->GetID()); + edit_->SetLogNumber(mems_.back()->GetNextLogNumber()); + edit_->SetColumnFamily(cfd_->GetID()); + + // path 0 for level 0 file. + meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + + base_ = cfd_->current(); + base_->Ref(); // it is likely that we do not need this reference +} + +Status FlushJob::Run(FileMetaData* file_meta) { + db_mutex_->AssertHeld(); + assert(pick_memtable_called); + AutoThreadOperationStageUpdater stage_run( + ThreadStatus::STAGE_FLUSH_RUN); + if (mems_.empty()) { + LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush", + cfd_->GetName().c_str()); + return Status::OK(); + } + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTime); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + } // This will release and re-acquire the mutex. 
- Status s = WriteLevel0Table(mems, edit, &meta); + Status s = WriteLevel0Table(); if (s.ok() && (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { @@ -151,18 +188,18 @@ Status FlushJob::Run(FileMetaData* file_meta) { } if (!s.ok()) { - cfd_->imm()->RollbackMemtableFlush(mems, meta.fd.GetNumber()); + cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber()); } else { TEST_SYNC_POINT("FlushJob::InstallResults"); // Replace immutable memtable with the generated Table s = cfd_->imm()->InstallMemtableFlushResults( - cfd_, mutable_cf_options_, mems, versions_, db_mutex_, - meta.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, + cfd_, mutable_cf_options_, mems_, versions_, db_mutex_, + meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, log_buffer_); } if (s.ok() && file_meta != nullptr) { - *file_meta = meta; + *file_meta = meta_; } RecordFlushIOStats(); @@ -176,34 +213,41 @@ Status FlushJob::Run(FileMetaData* file_meta) { stream << vstorage->NumLevelFiles(level); } stream.EndArray(); + stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed(); + + if (measure_io_stats_) { + if (prev_perf_level != PerfLevel::kEnableTime) { + SetPerfLevel(prev_perf_level); + } + stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos); + stream << "file_range_sync_nanos" + << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos); + stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos); + stream << "file_prepare_write_nanos" + << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos); + } return s; } -Status FlushJob::WriteLevel0Table(const autovector& mems, - VersionEdit* edit, FileMetaData* meta) { +Status FlushJob::WriteLevel0Table() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_FLUSH_WRITE_L0); db_mutex_->AssertHeld(); const uint64_t start_micros = db_options_.env->NowMicros(); - // path 0 for level 0 file. 
- meta->fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); - - Version* base = cfd_->current(); - base->Ref(); // it is likely that we do not need this reference Status s; { db_mutex_->Unlock(); if (log_buffer_) { log_buffer_->FlushBufferToLog(); } - std::vector memtables; + std::vector memtables; ReadOptions ro; ro.total_order_seek = true; Arena arena; uint64_t total_num_entries = 0, total_num_deletes = 0; size_t total_memory_usage = 0; - for (MemTable* m : mems) { + for (MemTable* m : mems_) { Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); @@ -215,85 +259,71 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, event_logger_->Log() << "job" << job_context_->job_id << "event" << "flush_started" - << "num_memtables" << mems.size() << "num_entries" + << "num_memtables" << mems_.size() << "num_entries" << total_num_entries << "num_deletes" << total_num_deletes << "memory_usage" << total_memory_usage; - TableFileCreationInfo info; { ScopedArenaIterator iter( NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], static_cast(memtables.size()), &arena)); Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started", - cfd_->GetName().c_str(), job_context_->job_id, meta->fd.GetNumber()); + cfd_->GetName().c_str(), job_context_->job_id, meta_.fd.GetNumber()); TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", &output_compression_); s = BuildTable( - dbname_, db_options_.env, *cfd_->ioptions(), env_options_, - cfd_->table_cache(), iter.get(), meta, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), existing_snapshots_, - output_compression_, cfd_->ioptions()->compression_opts, + dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_, + env_options_, cfd_->table_cache(), iter.get(), &meta_, + 
cfd_->internal_comparator(), cfd_->int_tbl_prop_collector_factories(), + cfd_->GetID(), cfd_->GetName(), existing_snapshots_, + earliest_write_conflict_snapshot_, output_compression_, + cfd_->ioptions()->compression_opts, mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), - Env::IO_HIGH, &info.table_properties); + TableFileCreationReason::kFlush, event_logger_, job_context_->job_id, + Env::IO_HIGH, &table_properties_, 0 /* level */); LogFlush(db_options_.info_log); } Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s" "%s", - cfd_->GetName().c_str(), job_context_->job_id, meta->fd.GetNumber(), - meta->fd.GetFileSize(), s.ToString().c_str(), - meta->marked_for_compaction ? " (needs compaction)" : ""); - - // output to event logger - if (s.ok()) { - info.db_name = dbname_; - info.cf_name = cfd_->GetName(); - info.file_path = TableFileName(db_options_.db_paths, - meta->fd.GetNumber(), - meta->fd.GetPathId()); - info.file_size = meta->fd.GetFileSize(); - info.job_id = job_context_->job_id; - EventHelpers::LogAndNotifyTableFileCreation( - event_logger_, db_options_.listeners, - meta->fd, info); - TEST_SYNC_POINT("FlushJob::LogAndNotifyTableFileCreation()"); - } + cfd_->GetName().c_str(), job_context_->job_id, meta_.fd.GetNumber(), + meta_.fd.GetFileSize(), s.ToString().c_str(), + meta_.marked_for_compaction ? " (needs compaction)" : ""); if (!db_options_.disableDataSync && output_file_directory_ != nullptr) { output_file_directory_->Fsync(); } + TEST_SYNC_POINT("FlushJob::WriteLevel0Table"); db_mutex_->Lock(); } - base->Unref(); - - // re-acquire the most current version - base = cfd_->current(); + base_->Unref(); // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. 
- if (s.ok() && meta->fd.GetFileSize() > 0) { + if (s.ok() && meta_.fd.GetFileSize() > 0) { // if we have more than 1 background thread, then we cannot // insert files directly into higher levels because some other // threads could be concurrently producing compacted files for // that key range. // Add file to L0 - edit->AddFile(0 /* level */, meta->fd.GetNumber(), meta->fd.GetPathId(), - meta->fd.GetFileSize(), meta->smallest, meta->largest, - meta->smallest_seqno, meta->largest_seqno, - meta->marked_for_compaction); + edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), + meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, + meta_.smallest_seqno, meta_.largest_seqno, + meta_.marked_for_compaction); } + // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(1); stats.micros = db_options_.env->NowMicros() - start_micros; - stats.bytes_written = meta->fd.GetFileSize(); + stats.bytes_written = meta_.fd.GetFileSize(); cfd_->internal_stats()->AddCompactionStats(0 /* level */, stats); cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - meta->fd.GetFileSize()); - RecordTick(stats_, COMPACT_WRITE_BYTES, meta->fd.GetFileSize()); + meta_.fd.GetFileSize()); + RecordFlushIOStats(); return s; } diff --git a/external/rocksdb/db/flush_job.h b/external/rocksdb/db/flush_job.h index 14555ef56f..5dc6a98710 100644 --- a/external/rocksdb/db/flush_job.h +++ b/external/rocksdb/db/flush_job.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -16,28 +16,28 @@ #include #include -#include "db/dbformat.h" #include "db/column_family.h" +#include "db/dbformat.h" +#include "db/flush_scheduler.h" +#include "db/internal_stats.h" +#include "db/job_context.h" #include "db/log_writer.h" #include "db/memtable_list.h" #include "db/snapshot_impl.h" #include "db/version_edit.h" +#include "db/write_controller.h" +#include "db/write_thread.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/event_logger.h" #include "util/instrumented_mutex.h" #include "util/stop_watch.h" #include "util/thread_local.h" -#include "util/scoped_arena_iterator.h" -#include "db/internal_stats.h" -#include "db/write_controller.h" -#include "db/flush_scheduler.h" -#include "db/write_thread.h" -#include "db/job_context.h" namespace rocksdb { @@ -58,21 +58,24 @@ class FlushJob { const EnvOptions& env_options, VersionSet* versions, InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, JobContext* job_context, LogBuffer* log_buffer, Directory* db_directory, Directory* output_file_directory, CompressionType output_compression, Statistics* stats, - EventLogger* event_logger); + EventLogger* event_logger, bool measure_io_stats); ~FlushJob(); + // Require db_mutex held + void PickMemTable(); Status Run(FileMetaData* file_meta = nullptr); + TableProperties GetTableProperties() const { return table_properties_; } private: void ReportStartedFlush(); void ReportFlushInputSize(const autovector& mems); void RecordFlushIOStats(); - Status WriteLevel0Table(const autovector& mems, VersionEdit* edit, - FileMetaData* meta); + Status WriteLevel0Table(); const std::string& dbname_; ColumnFamilyData* cfd_; const DBOptions& db_options_; @@ -82,6 +85,7 @@ class FlushJob { InstrumentedMutex* db_mutex_; 
std::atomic* shutting_down_; std::vector existing_snapshots_; + SequenceNumber earliest_write_conflict_snapshot_; JobContext* job_context_; LogBuffer* log_buffer_; Directory* db_directory_; @@ -89,6 +93,15 @@ class FlushJob { CompressionType output_compression_; Statistics* stats_; EventLogger* event_logger_; + TableProperties table_properties_; + bool measure_io_stats_; + + // Variables below are set by PickMemTable(): + FileMetaData meta_; + autovector mems_; + VersionEdit* edit_; + Version* base_; + bool pick_memtable_called; }; } // namespace rocksdb diff --git a/external/rocksdb/db/flush_job_test.cc b/external/rocksdb/db/flush_job_test.cc index d2c423c366..9648c01e4a 100644 --- a/external/rocksdb/db/flush_job_test.cc +++ b/external/rocksdb/db/flush_job_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -7,16 +7,16 @@ #include #include -#include "db/flush_job.h" #include "db/column_family.h" +#include "db/flush_job.h" #include "db/version_set.h" -#include "db/writebuffer.h" #include "rocksdb/cache.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" -#include "table/mock_table.h" namespace rocksdb { @@ -29,9 +29,9 @@ class FlushJobTest : public testing::Test { : env_(Env::Default()), dbname_(test::TmpDir() + "/flush_job_test"), table_cache_(NewLRUCache(50000, 16)), - write_buffer_(db_options_.db_write_buffer_size), + write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_, + table_cache_.get(), &write_buffer_manager_, &write_controller_)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { @@ -61,7 +61,7 @@ class FlushJobTest : public testing::Test { unique_ptr file_writer( new WritableFileWriter(std::move(file), EnvOptions())); { - log::Writer log(std::move(file_writer)); + log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); @@ -77,7 +77,7 @@ class FlushJobTest : public testing::Test { std::shared_ptr table_cache_; WriteController write_controller_; DBOptions db_options_; - WriteBuffer write_buffer_; + WriteBufferManager write_buffer_manager_; ColumnFamilyOptions cf_options_; std::unique_ptr versions_; InstrumentedMutex mutex_; @@ -92,9 +92,13 @@ TEST_F(FlushJobTest, Empty) { FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, &shutting_down_, - {}, &job_context, nullptr, nullptr, nullptr, - kNoCompression, nullptr, &event_logger); - ASSERT_OK(flush_job.Run()); + {}, kMaxSequenceNumber, &job_context, nullptr, 
nullptr, + nullptr, kNoCompression, nullptr, &event_logger, false); + { + InstrumentedMutexLock l(&mutex_); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run()); + } job_context.Clean(); } @@ -131,10 +135,11 @@ TEST_F(FlushJobTest, NonEmpty) { FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, &shutting_down_, - {}, &job_context, nullptr, nullptr, nullptr, - kNoCompression, nullptr, &event_logger); + {}, kMaxSequenceNumber, &job_context, nullptr, nullptr, + nullptr, kNoCompression, nullptr, &event_logger, true); FileMetaData fd; mutex_.Lock(); + flush_job.PickMemTable(); ASSERT_OK(flush_job.Run(&fd)); mutex_.Unlock(); ASSERT_EQ(ToString(0), fd.smallest.user_key().ToString()); @@ -192,12 +197,13 @@ TEST_F(FlushJobTest, Snapshots) { } EventLogger event_logger(db_options_.info_log.get()); - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - env_options_, versions_.get(), &mutex_, &shutting_down_, - snapshots, &job_context, nullptr, nullptr, nullptr, - kNoCompression, nullptr, &event_logger); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, + &shutting_down_, snapshots, kMaxSequenceNumber, &job_context, nullptr, + nullptr, nullptr, kNoCompression, nullptr, &event_logger, true); mutex_.Lock(); + flush_job.PickMemTable(); ASSERT_OK(flush_job.Run()); mutex_.Unlock(); mock_table_factory_->AssertSingleFile(inserted_keys); diff --git a/external/rocksdb/db/flush_scheduler.cc b/external/rocksdb/db/flush_scheduler.cc index 56816159e2..a961f7f0bc 100644 --- a/external/rocksdb/db/flush_scheduler.cc +++ b/external/rocksdb/db/flush_scheduler.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,51 +13,72 @@ namespace rocksdb { void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { #ifndef NDEBUG - assert(column_families_set_.find(cfd) == column_families_set_.end()); - column_families_set_.insert(cfd); + { + std::lock_guard lock(checking_mutex_); + assert(checking_set_.count(cfd) == 0); + checking_set_.insert(cfd); + } #endif // NDEBUG cfd->Ref(); - column_families_.push_back(cfd); +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ + Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)}; + while (!head_.compare_exchange_strong( + node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) { + // failing CAS updates the first param, so we are already set for + // retry. 
TakeNextColumnFamily won't happen until after another + // inter-thread synchronization, so we don't even need release + // semantics for this CAS + } +#endif // __clang_analyzer__ } -ColumnFamilyData* FlushScheduler::GetNextColumnFamily() { - ColumnFamilyData* cfd = nullptr; - while (column_families_.size() > 0) { - cfd = column_families_.front(); - column_families_.pop_front(); - if (cfd->IsDropped()) { - if (cfd->Unref()) { - delete cfd; - cfd = nullptr; - } - } else { - break; +ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { + while (true) { + if (Empty()) { + return nullptr; } - } + + // dequeue the head + Node* node = head_.load(std::memory_order_relaxed); + head_.store(node->next, std::memory_order_relaxed); + ColumnFamilyData* cfd = node->column_family; + delete node; + #ifndef NDEBUG - if (cfd != nullptr) { - auto itr = column_families_set_.find(cfd); - assert(itr != column_families_set_.end()); - column_families_set_.erase(itr); - } + { + auto iter = checking_set_.find(cfd); + assert(iter != checking_set_.end()); + checking_set_.erase(iter); + } #endif // NDEBUG - return cfd; + + if (!cfd->IsDropped()) { + // success + return cfd; + } + + // no longer relevant, retry + if (cfd->Unref()) { + delete cfd; + } + } } -bool FlushScheduler::Empty() { return column_families_.empty(); } +bool FlushScheduler::Empty() { + auto rv = head_.load(std::memory_order_relaxed) == nullptr; + assert(rv == checking_set_.empty()); + return rv; +} void FlushScheduler::Clear() { - for (auto cfd : column_families_) { -#ifndef NDEBUG - auto itr = column_families_set_.find(cfd); - assert(itr != column_families_set_.end()); - column_families_set_.erase(itr); -#endif // NDEBUG + ColumnFamilyData* cfd; + while ((cfd = TakeNextColumnFamily()) != nullptr) { if (cfd->Unref()) { delete cfd; } } - column_families_.clear(); + assert(Empty()); } } // namespace rocksdb diff --git a/external/rocksdb/db/flush_scheduler.h b/external/rocksdb/db/flush_scheduler.h index 
0c96709b9f..820bd7b71c 100644 --- a/external/rocksdb/db/flush_scheduler.h +++ b/external/rocksdb/db/flush_scheduler.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,34 +6,42 @@ #pragma once #include -#include +#include +#include #include -#include namespace rocksdb { class ColumnFamilyData; -// This class is thread-compatible. It's should only be accessed from single -// write thread (between BeginWrite() and EndWrite()) +// Unless otherwise noted, all methods on FlushScheduler should be called +// only with the DB mutex held or from a single-threaded recovery context. class FlushScheduler { public: - FlushScheduler() = default; - ~FlushScheduler() = default; + FlushScheduler() : head_(nullptr) {} + // May be called from multiple threads at once, but not concurrent with + // any other method calls on this instance void ScheduleFlush(ColumnFamilyData* cfd); - // Returns Ref()-ed column family. Client needs to Unref() - // REQUIRES: db mutex is held (exception is single-threaded recovery) - ColumnFamilyData* GetNextColumnFamily(); + + // Removes and returns Ref()-ed column family. Client needs to Unref(). + // Filters column families that have been dropped. 
+ ColumnFamilyData* TakeNextColumnFamily(); bool Empty(); void Clear(); private: - std::deque column_families_; + struct Node { + ColumnFamilyData* column_family; + Node* next; + }; + + std::atomic head_; #ifndef NDEBUG - std::set column_families_set_; + std::mutex checking_mutex_; + std::set checking_set_; #endif // NDEBUG }; diff --git a/external/rocksdb/db/forward_iterator.cc b/external/rocksdb/db/forward_iterator.cc index c0d7647c5a..f7eb8ca248 100644 --- a/external/rocksdb/db/forward_iterator.cc +++ b/external/rocksdb/db/forward_iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,15 +10,16 @@ #include #include -#include "db/job_context.h" +#include "db/column_family.h" #include "db/db_impl.h" #include "db/db_iter.h" -#include "db/column_family.h" +#include "db/dbformat.h" +#include "db/job_context.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/merger.h" -#include "db/dbformat.h" +#include "util/string_util.h" #include "util/sync_point.h" namespace rocksdb { @@ -28,7 +29,7 @@ namespace rocksdb { // iter.SetFileIndex(file_index); // iter.Seek(target); // iter.Next() -class LevelIterator : public Iterator { +class LevelIterator : public InternalIterator { public: LevelIterator(const ColumnFamilyData* const cfd, const ReadOptions& read_options, @@ -113,7 +114,7 @@ class LevelIterator : public Iterator { bool valid_; uint32_t file_index_; Status status_; - std::unique_ptr file_iter_; + std::unique_ptr file_iter_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, @@ -144,12 +145,33 @@ ForwardIterator::~ForwardIterator() { Cleanup(true); } 
+void ForwardIterator::SVCleanup() { + if (sv_ != nullptr && sv_->Unref()) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + db_->mutex_.Lock(); + sv_->Cleanup(); + db_->FindObsoleteFiles(&job_context, false, true); + if (read_options_.background_purge_on_iterator_cleanup) { + db_->ScheduleBgLogWriterClose(&job_context); + } + db_->mutex_.Unlock(); + delete sv_; + if (job_context.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles( + job_context, read_options_.background_purge_on_iterator_cleanup); + } + job_context.Clean(); + } +} + void ForwardIterator::Cleanup(bool release_sv) { if (mutable_iter_ != nullptr) { - mutable_iter_->~Iterator(); + mutable_iter_->~InternalIterator(); } for (auto* m : imm_iters_) { - m->~Iterator(); + m->~InternalIterator(); } imm_iters_.clear(); for (auto* f : l0_iters_) { @@ -162,20 +184,7 @@ void ForwardIterator::Cleanup(bool release_sv) { level_iters_.clear(); if (release_sv) { - if (sv_ != nullptr && sv_->Unref()) { - // Job id == 0 means that this is not our background process, but rather - // user thread - JobContext job_context(0); - db_->mutex_.Lock(); - sv_->Cleanup(); - db_->FindObsoleteFiles(&job_context, false, true); - db_->mutex_.Unlock(); - delete sv_; - if (job_context.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(job_context); - } - job_context.Clean(); - } + SVCleanup(); } } @@ -185,9 +194,10 @@ bool ForwardIterator::Valid() const { } void ForwardIterator::SeekToFirst() { - if (sv_ == nullptr || - sv_ ->version_number != cfd_->GetSuperVersionNumber()) { + if (sv_ == nullptr) { RebuildIterators(true); + } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) { + RenewIterators(); } else if (immutable_status_.IsIncomplete()) { ResetIncompleteIterators(); } @@ -205,9 +215,10 @@ void ForwardIterator::Seek(const Slice& internal_key) { if (IsOverUpperBound(internal_key)) { valid_ = false; } - if (sv_ == nullptr || - sv_ 
->version_number != cfd_->GetSuperVersionNumber()) { + if (sv_ == nullptr) { RebuildIterators(true); + } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) { + RenewIterators(); } else if (immutable_status_.IsIncomplete()) { ResetIncompleteIterators(); } @@ -227,7 +238,9 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, // an option to turn it off. if (seek_to_first || NeedToSeekImmutable(internal_key)) { immutable_status_ = Status::OK(); - if (has_iter_trimmed_for_upper_bound_) { + if ((has_iter_trimmed_for_upper_bound_) && + (cfd_->internal_comparator().InternalKeyComparator::Compare( + prev_key_.GetKey(), internal_key) > 0)) { // Some iterators are trimmed. Need to rebuild. RebuildIterators(true); // Already seeked mutable iter, so seek again @@ -254,7 +267,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } const VersionStorageInfo* vstorage = sv_->current->storage_info(); const std::vector& l0 = vstorage->LevelFiles(0); - for (uint32_t i = 0; i < l0.size(); ++i) { + for (size_t i = 0; i < l0.size(); ++i) { if (!l0_iters_[i]) { continue; } @@ -393,7 +406,11 @@ void ForwardIterator::Next() { std::string current_key = key().ToString(); Slice old_key(current_key.data(), current_key.size()); - RebuildIterators(true); + if (sv_ == nullptr) { + RebuildIterators(true); + } else { + RenewIterators(); + } SeekInternal(old_key, false); if (!valid_ || key().compare(old_key) != 0) { return; @@ -459,6 +476,15 @@ Status ForwardIterator::status() const { return immutable_status_; } +Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) { + assert(prop != nullptr); + if (prop_name == "rocksdb.iterator.super-version-number") { + *prop = ToString(sv_->version_number); + return Status::OK(); + } + return Status::InvalidArgument(); +} + void ForwardIterator::RebuildIterators(bool refresh_sv) { // Clean up Cleanup(refresh_sv); @@ -484,10 +510,81 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { 
l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd)); } + BuildLevelIterators(vstorage); + current_ = nullptr; + is_prev_set_ = false; +} + +void ForwardIterator::RenewIterators() { + SuperVersion* svnew; + assert(sv_); + svnew = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); + + if (mutable_iter_ != nullptr) { + mutable_iter_->~InternalIterator(); + } + for (auto* m : imm_iters_) { + m->~InternalIterator(); + } + imm_iters_.clear(); + + mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); + svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); + + const auto* vstorage = sv_->current->storage_info(); + const auto& l0_files = vstorage->LevelFiles(0); + const auto* vstorage_new = svnew->current->storage_info(); + const auto& l0_files_new = vstorage_new->LevelFiles(0); + size_t iold, inew; + bool found; + std::vector l0_iters_new; + l0_iters_new.reserve(l0_files_new.size()); + + for (inew = 0; inew < l0_files_new.size(); inew++) { + found = false; + for (iold = 0; iold < l0_files.size(); iold++) { + if (l0_files[iold] == l0_files_new[inew]) { + found = true; + break; + } + } + if (found) { + if (l0_iters_[iold] == nullptr) { + l0_iters_new.push_back(nullptr); + TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this); + } else { + l0_iters_new.push_back(l0_iters_[iold]); + l0_iters_[iold] = nullptr; + TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this); + } + continue; + } + l0_iters_new.push_back(cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), + l0_files_new[inew]->fd)); + } + + for (auto* f : l0_iters_) { + delete f; + } + l0_iters_.clear(); + l0_iters_ = l0_iters_new; + + for (auto* l : level_iters_) { + delete l; + } + level_iters_.clear(); + BuildLevelIterators(vstorage_new); + current_ = nullptr; + is_prev_set_ = false; + SVCleanup(); + sv_ = svnew; +} + +void 
ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage) { level_iters_.reserve(vstorage->num_levels() - 1); for (int32_t level = 1; level < vstorage->num_levels(); ++level) { const auto& level_files = vstorage->LevelFiles(level); - if ((level_files.empty()) || ((read_options_.iterate_upper_bound != nullptr) && (user_comparator_->Compare(*read_options_.iterate_upper_bound, @@ -502,14 +599,11 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { new LevelIterator(cfd_, read_options_, level_files)); } } - - current_ = nullptr; - is_prev_set_ = false; } void ForwardIterator::ResetIncompleteIterators() { const auto& l0_files = sv_->current->storage_info()->LevelFiles(0); - for (uint32_t i = 0; i < l0_iters_.size(); ++i) { + for (size_t i = 0; i < l0_iters_.size(); ++i) { assert(i < l0_files.size()); if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) { continue; @@ -600,7 +694,7 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { void ForwardIterator::DeleteCurrentIter() { const VersionStorageInfo* vstorage = sv_->current->storage_info(); const std::vector& l0 = vstorage->LevelFiles(0); - for (uint32_t i = 0; i < l0.size(); ++i) { + for (size_t i = 0; i < l0.size(); ++i) { if (!l0_iters_[i]) { continue; } @@ -632,7 +726,7 @@ bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters, const VersionStorageInfo* vstorage = sv_->current->storage_info(); const std::vector& l0 = vstorage->LevelFiles(0); - for (uint32_t i = 0; i < l0.size(); ++i) { + for (size_t i = 0; i < l0.size(); ++i) { if (!l0_iters_[i]) { retval = true; deleted_iters++; diff --git a/external/rocksdb/db/forward_iterator.h b/external/rocksdb/db/forward_iterator.h index e6ef0bdfcb..b5beeceefc 100644 --- a/external/rocksdb/db/forward_iterator.h +++ b/external/rocksdb/db/forward_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,6 +14,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "table/internal_iterator.h" #include "util/arena.h" namespace rocksdb { @@ -23,6 +24,7 @@ class Env; struct SuperVersion; class ColumnFamilyData; class LevelIterator; +class VersionStorageInfo; struct FileMetaData; class MinIterComparator { @@ -30,16 +32,15 @@ class MinIterComparator { explicit MinIterComparator(const Comparator* comparator) : comparator_(comparator) {} - bool operator()(Iterator* a, Iterator* b) { + bool operator()(InternalIterator* a, InternalIterator* b) { return comparator_->Compare(a->key(), b->key()) > 0; } private: const Comparator* comparator_; }; -typedef std::priority_queue, - MinIterComparator> MinIterHeap; +typedef std::priority_queue, + MinIterComparator> MinIterHeap; /** * ForwardIterator is a special type of iterator that only supports Seek() @@ -48,7 +49,7 @@ typedef std::priority_queue imm_iters_; - std::vector l0_iters_; + InternalIterator* mutable_iter_; + std::vector imm_iters_; + std::vector l0_iters_; std::vector level_iters_; - Iterator* current_; + InternalIterator* current_; bool valid_; // Internal iterator status; set only by one of the unsupported methods. diff --git a/external/rocksdb/db/forward_iterator_bench.cc b/external/rocksdb/db/forward_iterator_bench.cc new file mode 100644 index 0000000000..0f44a9e448 --- /dev/null +++ b/external/rocksdb/db/forward_iterator_bench.cc @@ -0,0 +1,374 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#if !defined(GFLAGS) || defined(ROCKSDB_LITE) +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#elif defined(OS_MACOSX) || defined(OS_WIN) +// Block forward_iterator_bench under MAC and Windows +int main() { return 0; } +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "util/testharness.h" + +const int MAX_SHARDS = 100000; + +DEFINE_int32(writers, 8, ""); +DEFINE_int32(readers, 8, ""); +DEFINE_int64(rate, 100000, ""); +DEFINE_int64(value_size, 300, ""); +DEFINE_int64(shards, 1000, ""); +DEFINE_int64(memtable_size, 500000000, ""); +DEFINE_int64(block_cache_size, 300000000, ""); +DEFINE_int64(block_size, 65536, ""); +DEFINE_double(runtime, 300.0, ""); +DEFINE_bool(cache_only_first, true, ""); +DEFINE_bool(iterate_upper_bound, true, ""); + +struct Stats { + char pad1[128] __attribute__((__unused__)); + std::atomic written{0}; + char pad2[128] __attribute__((__unused__)); + std::atomic read{0}; + std::atomic cache_misses{0}; + char pad3[128] __attribute__((__unused__)); +} stats; + +struct Key { + Key() {} + Key(uint64_t shard_in, uint64_t seqno_in) + : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {} + + uint64_t shard() const { return be64toh(shard_be); } + uint64_t seqno() const { return be64toh(seqno_be); } + + private: + uint64_t shard_be; + uint64_t seqno_be; +} __attribute__((__packed__)); + +struct Reader; +struct Writer; + +struct ShardState { + char pad1[128] __attribute__((__unused__)); + std::atomic last_written{0}; + Writer* writer; + Reader* reader; + char pad2[128] __attribute__((__unused__)); + std::atomic last_read{0}; + std::unique_ptr it; + std::unique_ptr it_cacheonly; + Key upper_bound; + rocksdb::Slice 
upper_bound_slice; + char pad3[128] __attribute__((__unused__)); +}; + +struct Reader { + public: + explicit Reader(std::vector* shard_states, rocksdb::DB* db) + : shard_states_(shard_states), db_(db) { + sem_init(&sem_, 0, 0); + thread_ = std::thread(&Reader::run, this); + } + + void run() { + while (1) { + sem_wait(&sem_); + if (done_.load()) { + break; + } + + uint64_t shard; + { + std::lock_guard guard(queue_mutex_); + assert(!shards_pending_queue_.empty()); + shard = shards_pending_queue_.front(); + shards_pending_queue_.pop(); + shards_pending_set_.reset(shard); + } + readOnceFromShard(shard); + } + } + + void readOnceFromShard(uint64_t shard) { + ShardState& state = (*shard_states_)[shard]; + if (!state.it) { + // Initialize iterators + rocksdb::ReadOptions options; + options.tailing = true; + if (FLAGS_iterate_upper_bound) { + state.upper_bound = Key(shard, std::numeric_limits::max()); + state.upper_bound_slice = rocksdb::Slice( + (const char*)&state.upper_bound, sizeof(state.upper_bound)); + options.iterate_upper_bound = &state.upper_bound_slice; + } + + state.it.reset(db_->NewIterator(options)); + + if (FLAGS_cache_only_first) { + options.read_tier = rocksdb::ReadTier::kBlockCacheTier; + state.it_cacheonly.reset(db_->NewIterator(options)); + } + } + + const uint64_t upto = state.last_written.load(); + for (rocksdb::Iterator* it : {state.it_cacheonly.get(), state.it.get()}) { + if (it == nullptr) { + continue; + } + if (state.last_read.load() >= upto) { + break; + } + bool need_seek = true; + for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) { + if (need_seek) { + Key from(shard, state.last_read.load() + 1); + it->Seek(rocksdb::Slice((const char*)&from, sizeof(from))); + need_seek = false; + } else { + it->Next(); + } + if (it->status().IsIncomplete()) { + ++::stats.cache_misses; + break; + } + assert(it->Valid()); + assert(it->key().size() == sizeof(Key)); + Key key; + memcpy(&key, it->key().data(), it->key().size()); + // 
fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n", + // shard, seq, key.shard(), key.seqno()); + assert(key.shard() == shard); + assert(key.seqno() == seq); + state.last_read.store(seq); + ++::stats.read; + } + } + } + + void onWrite(uint64_t shard) { + { + std::lock_guard guard(queue_mutex_); + if (!shards_pending_set_.test(shard)) { + shards_pending_queue_.push(shard); + shards_pending_set_.set(shard); + sem_post(&sem_); + } + } + } + + ~Reader() { + done_.store(true); + sem_post(&sem_); + thread_.join(); + } + + private: + char pad1[128] __attribute__((__unused__)); + std::vector* shard_states_; + rocksdb::DB* db_; + std::thread thread_; + sem_t sem_; + std::mutex queue_mutex_; + std::bitset shards_pending_set_; + std::queue shards_pending_queue_; + std::atomic done_{false}; + char pad2[128] __attribute__((__unused__)); +}; + +struct Writer { + explicit Writer(std::vector* shard_states, rocksdb::DB* db) + : shard_states_(shard_states), db_(db) {} + + void start() { thread_ = std::thread(&Writer::run, this); } + + void run() { + std::queue workq; + std::chrono::steady_clock::time_point deadline( + std::chrono::steady_clock::now() + + std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime))); + std::vector my_shards; + for (int i = 1; i <= FLAGS_shards; ++i) { + if ((*shard_states_)[i].writer == this) { + my_shards.push_back(i); + } + } + + std::mt19937 rng{std::random_device()()}; + std::uniform_int_distribution shard_dist( + 0, static_cast(my_shards.size()) - 1); + std::string value(FLAGS_value_size, '*'); + + while (1) { + auto now = std::chrono::steady_clock::now(); + if (FLAGS_runtime >= 0 && now >= deadline) { + break; + } + if (workq.empty()) { + for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) { + std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate); + workq.push(now + offset); + } + } + while (!workq.empty() && workq.front() < now) { + workq.pop(); + uint64_t shard = my_shards[shard_dist(rng)]; + ShardState& state = 
(*shard_states_)[shard]; + uint64_t seqno = state.last_written.load() + 1; + Key key(shard, seqno); + // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno); + rocksdb::Status status = + db_->Put(rocksdb::WriteOptions(), + rocksdb::Slice((const char*)&key, sizeof(key)), + rocksdb::Slice(value)); + assert(status.ok()); + state.last_written.store(seqno); + state.reader->onWrite(shard); + ++::stats.written; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + // fprintf(stderr, "Writer done\n"); + } + + ~Writer() { thread_.join(); } + + private: + char pad1[128] __attribute__((__unused__)); + std::vector* shard_states_; + rocksdb::DB* db_; + std::thread thread_; + char pad2[128] __attribute__((__unused__)); +}; + +struct StatsThread { + explicit StatsThread(rocksdb::DB* db) + : db_(db), thread_(&StatsThread::run, this) {} + + void run() { + // using namespace std::chrono; + auto tstart = std::chrono::steady_clock::now(), tlast = tstart; + uint64_t wlast = 0, rlast = 0; + while (!done_.load()) { + { + std::unique_lock lock(cvm_); + cv_.wait_for(lock, std::chrono::seconds(1)); + } + auto now = std::chrono::steady_clock::now(); + double elapsed = + std::chrono::duration_cast >( + now - tlast).count(); + uint64_t w = ::stats.written.load(); + uint64_t r = ::stats.read.load(); + fprintf(stderr, + "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | " + "r/s %10.0f | cache misses %10ld\n", + db_->GetEnv()->TimeToString(time(nullptr)).c_str(), + std::chrono::duration_cast(now - tstart) + .count(), + w, (w - wlast) / elapsed, r, (r - rlast) / elapsed, + ::stats.cache_misses.load()); + wlast = w; + rlast = r; + tlast = now; + } + } + + ~StatsThread() { + { + std::lock_guard guard(cvm_); + done_.store(true); + } + cv_.notify_all(); + thread_.join(); + } + + private: + rocksdb::DB* db_; + std::mutex cvm_; + std::condition_variable cv_; + std::thread thread_; + std::atomic done_{false}; +}; + +int main(int argc, char** argv) { + 
GFLAGS::ParseCommandLineFlags(&argc, &argv, true); + + std::mt19937 rng{std::random_device()()}; + rocksdb::Status status; + std::string path = rocksdb::test::TmpDir() + "/forward_iterator_test"; + fprintf(stderr, "db path is %s\n", path.c_str()); + rocksdb::Options options; + options.create_if_missing = true; + options.compression = rocksdb::CompressionType::kNoCompression; + options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone; + options.level0_slowdown_writes_trigger = 99999; + options.level0_stop_writes_trigger = 99999; + options.allow_os_buffer = false; + options.write_buffer_size = FLAGS_memtable_size; + rocksdb::BlockBasedTableOptions table_options; + table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size); + table_options.block_size = FLAGS_block_size; + options.table_factory.reset( + rocksdb::NewBlockBasedTableFactory(table_options)); + + status = rocksdb::DestroyDB(path, options); + assert(status.ok()); + rocksdb::DB* db_raw; + status = rocksdb::DB::Open(options, path, &db_raw); + assert(status.ok()); + std::unique_ptr db(db_raw); + + std::vector shard_states(FLAGS_shards + 1); + std::deque readers; + while (static_cast(readers.size()) < FLAGS_readers) { + readers.emplace_back(&shard_states, db_raw); + } + std::deque writers; + while (static_cast(writers.size()) < FLAGS_writers) { + writers.emplace_back(&shard_states, db_raw); + } + + // Each shard gets a random reader and random writer assigned to it + for (int i = 1; i <= FLAGS_shards; ++i) { + std::uniform_int_distribution reader_dist(0, FLAGS_readers - 1); + std::uniform_int_distribution writer_dist(0, FLAGS_writers - 1); + shard_states[i].reader = &readers[reader_dist(rng)]; + shard_states[i].writer = &writers[writer_dist(rng)]; + } + + StatsThread stats_thread(db_raw); + for (Writer& w : writers) { + w.start(); + } + + writers.clear(); + readers.clear(); +} +#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE) diff --git a/external/rocksdb/db/inlineskiplist.h 
b/external/rocksdb/db/inlineskiplist.h new file mode 100644 index 0000000000..cfd47f39f4 --- /dev/null +++ b/external/rocksdb/db/inlineskiplist.h @@ -0,0 +1,657 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional +// grant of patent rights can be found in the PATENTS file in the same +// directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. Use of +// this source code is governed by a BSD-style license that can be found +// in the LICENSE file. See the AUTHORS file for names of contributors. +// +// InlineSkipList is derived from SkipList (skiplist.h), but it optimizes +// the memory layout by requiring that the key storage be allocated through +// the skip list instance. For the common case of SkipList this saves 1 pointer per skip list node and gives better cache +// locality, at the expense of wasted padding from using AllocateAligned +// instead of Allocate for the keys. The unused padding will be from +// 0 to sizeof(void*)-1 bytes, and the space savings are sizeof(void*) +// bytes, so despite the padding the space used is always less than +// SkipList. +// +// Thread safety ------------- +// +// Writes via Insert require external synchronization, most likely a mutex. +// InsertConcurrently can be safely called concurrently with reads and +// with other concurrent inserts. Reads require a guarantee that the +// InlineSkipList will not be destroyed while the read is in progress. +// Apart from that, reads progress without any internal locking or +// synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the InlineSkipList is +// destroyed. This is trivially guaranteed by the code since we never +// delete any skip list nodes. 
+// +// (2) The contents of a Node except for the next/prev pointers are +// immutable after the Node has been linked into the InlineSkipList. +// Only Insert() modifies the list, and it is careful to initialize a +// node and use release-stores to publish the nodes in one or more lists. +// +// ... prev vs. next pointer ordering ... +// + +#pragma once +#include +#include +#include +#include "port/port.h" +#include "util/allocator.h" +#include "util/random.h" + +namespace rocksdb { + +template +class InlineSkipList { + private: + struct Node; + + public: + // Create a new InlineSkipList object that will use "cmp" for comparing + // keys, and will allocate memory using "*allocator". Objects allocated + // in the allocator must remain allocated for the lifetime of the + // skiplist object. + explicit InlineSkipList(Comparator cmp, Allocator* allocator, + int32_t max_height = 12, + int32_t branching_factor = 4); + + // Allocates a key and a skip-list node, returning a pointer to the key + // portion of the node. This method is thread-safe if the allocator + // is thread-safe. + char* AllocateKey(size_t key_size); + + // Inserts a key allocated by AllocateKey, after the actual key value + // has been filled in. + // + // REQUIRES: nothing that compares equal to key is currently in the list. + // REQUIRES: no concurrent calls to INSERT + void Insert(const char* key); + + // Like Insert, but external synchronization is not required. + void InsertConcurrently(const char* key); + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const char* key) const; + + // Return estimated number of entries smaller than `key`. + uint64_t EstimateCount(const char* key) const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. 
+ explicit Iterator(const InlineSkipList* list); + + // Change the underlying skiplist used for this iterator + // This enables us not changing the iterator without deallocating + // an old one and then allocating a new one + void SetList(const InlineSkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. + // REQUIRES: Valid() + const char* key() const; + + // Advances to the next position. + // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const char* target); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const InlineSkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + enum MaxPossibleHeightEnum : uint16_t { kMaxPossibleHeight = 32 }; + + const uint16_t kMaxHeight_; + const uint16_t kBranching_; + const uint32_t kScaledInverseBranching_; + + // Immutable after construction + Comparator const compare_; + Allocator* const allocator_; // Allocator used for allocations of nodes + + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + std::atomic max_height_; // Height of the entire list + + // Used for optimizing sequential insert patterns. Tricky. prev_height_ + // of zero means prev_ is undefined. Otherwise: prev_[i] for i up + // to max_height_ - 1 (inclusive) is the predecessor of prev_[0], and + // prev_height_ is the height of prev_[0]. prev_[0] can only be equal + // to head when max_height_ and prev_height_ are both 1. 
+ Node** prev_; + std::atomic prev_height_; + + inline int GetMaxHeight() const { + return max_height_.load(std::memory_order_relaxed); + } + + int RandomHeight(); + + Node* AllocateNode(size_t key_size, int height); + + bool Equal(const char* a, const char* b) const { + return (compare_(a, b) == 0); + } + + // Return true if key is greater than the data stored in "n". Null n + // is considered infinite. + bool KeyIsAfterNode(const char* key, Node* n) const; + + // Returns the earliest node with a key >= key. + // Return nullptr if there is no such node. + Node* FindGreaterOrEqual(const char* key) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. + // Fills prev[level] with pointer to previous node at "level" for every + // level in [0..max_height_-1], if prev is non-null. + Node* FindLessThan(const char* key, Node** prev = nullptr) const; + + // Return the last node in the list. + // Return head_ if list is empty. + Node* FindLast() const; + + // Traverses a single level of the list, setting *out_prev to the last + // node before the key and *out_next to the first node after. Assumes + // that the key is not present in the skip list. On entry, before should + // point to a node that is before the key, and after should point to + // a node that is after the key. after should be nullptr if a good after + // node isn't conveniently available. + void FindLevelSplice(const char* key, Node* before, Node* after, int level, + Node** out_prev, Node** out_next); + + // No copying allowed + InlineSkipList(const InlineSkipList&); + InlineSkipList& operator=(const InlineSkipList&); +}; + +// Implementation details follow + +// The Node data type is more of a pointer into custom-managed memory than +// a traditional C++ struct. The key is stored in the bytes immediately +// after the struct, and the next_ pointers for nodes with height > 1 are +// stored immediately _before_ the struct. 
This avoids the need to include +// any pointer or sizing data, which reduces per-node memory overheads. +template +struct InlineSkipList::Node { + // Stores the height of the node in the memory location normally used for + // next_[0]. This is used for passing data from AllocateKey to Insert. + void StashHeight(const int height) { + assert(sizeof(int) <= sizeof(next_[0])); + memcpy(&next_[0], &height, sizeof(int)); + } + + // Retrieves the value passed to StashHeight. Undefined after a call + // to SetNext or NoBarrier_SetNext. + int UnstashHeight() const { + int rv; + memcpy(&rv, &next_[0], sizeof(int)); + return rv; + } + + const char* Key() const { return reinterpret_cast(&next_[1]); } + + // Accessors/mutators for links. Wrapped in methods so we can add + // the appropriate barriers as necessary, and perform the necessary + // addressing trickery for storing links below the Node in memory. + Node* Next(int n) { + assert(n >= 0); + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return (next_[-n].load(std::memory_order_acquire)); + } + + void SetNext(int n, Node* x) { + assert(n >= 0); + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_[-n].store(x, std::memory_order_release); + } + + bool CASNext(int n, Node* expected, Node* x) { + assert(n >= 0); + return next_[-n].compare_exchange_strong(expected, x); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next(int n) { + assert(n >= 0); + return next_[-n].load(std::memory_order_relaxed); + } + + void NoBarrier_SetNext(int n, Node* x) { + assert(n >= 0); + next_[-n].store(x, std::memory_order_relaxed); + } + + private: + // next_[0] is the lowest level link (level 0). Higher levels are + // stored _earlier_, so level 1 is at next_[-1]. 
+ std::atomic next_[1]; +}; + +template +inline InlineSkipList::Iterator::Iterator( + const InlineSkipList* list) { + SetList(list); +} + +template +inline void InlineSkipList::Iterator::SetList( + const InlineSkipList* list) { + list_ = list; + node_ = nullptr; +} + +template +inline bool InlineSkipList::Iterator::Valid() const { + return node_ != nullptr; +} + +template +inline const char* InlineSkipList::Iterator::key() const { + assert(Valid()); + return node_->Key(); +} + +template +inline void InlineSkipList::Iterator::Next() { + assert(Valid()); + node_ = node_->Next(0); +} + +template +inline void InlineSkipList::Iterator::Prev() { + // Instead of using explicit "prev" links, we just search for the + // last node that falls before key. + assert(Valid()); + node_ = list_->FindLessThan(node_->Key()); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +inline void InlineSkipList::Iterator::Seek(const char* target) { + node_ = list_->FindGreaterOrEqual(target); +} + +template +inline void InlineSkipList::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template +inline void InlineSkipList::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +int InlineSkipList::RandomHeight() { + auto rnd = Random::GetTLSInstance(); + + // Increase height with probability 1 in kBranching + int height = 1; + while (height < kMaxHeight_ && height < kMaxPossibleHeight && + rnd->Next() < kScaledInverseBranching_) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight_); + assert(height <= kMaxPossibleHeight); + return height; +} + +template +bool InlineSkipList::KeyIsAfterNode(const char* key, + Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->Key(), key) < 0); +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindGreaterOrEqual(const char* key) const { + // Note: It looks like we could reduce duplication 
by implementing + // this function as FindLessThan(key)->Next(0), but we wouldn't be able + // to exit early on equality and the result wouldn't even be correct. + // A concurrent insert might occur after FindLessThan(key) but before + // we get a chance to call Next(0). + Node* x = head_; + int level = GetMaxHeight() - 1; + Node* last_bigger = nullptr; + while (true) { + Node* next = x->Next(level); + // Make sure the lists are sorted + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); + // Make sure we haven't overshot during our search + assert(x == head_ || KeyIsAfterNode(key, x)); + int cmp = (next == nullptr || next == last_bigger) + ? 1 + : compare_(next->Key(), key); + if (cmp == 0 || (cmp > 0 && level == 0)) { + return next; + } else if (cmp < 0) { + // Keep searching in this list + x = next; + } else { + // Switch to next list, reuse compare_() result + last_bigger = next; + level--; + } + } +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindLessThan(const char* key, Node** prev) const { + Node* x = head_; + int level = GetMaxHeight() - 1; + // KeyIsAfter(key, last_not_after) is definitely false + Node* last_not_after = nullptr; + while (true) { + Node* next = x->Next(level); + assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); + assert(x == head_ || KeyIsAfterNode(key, x)); + if (next != last_not_after && KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != nullptr) { + prev[level] = x; + } + if (level == 0) { + return x; + } else { + // Switch to next list, reuse KeyIUsAfterNode() result + last_not_after = next; + level--; + } + } + } +} + +template +typename InlineSkipList::Node* +InlineSkipList::FindLast() const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == nullptr) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } 
+ } +} + +template +uint64_t InlineSkipList::EstimateCount(const char* key) const { + uint64_t count = 0; + + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + assert(x == head_ || compare_(x->Key(), key) < 0); + Node* next = x->Next(level); + if (next == nullptr || compare_(next->Key(), key) >= 0) { + if (level == 0) { + return count; + } else { + // Switch to next list + count *= kBranching_; + level--; + } + } else { + x = next; + count++; + } + } +} + +template +InlineSkipList::InlineSkipList(const Comparator cmp, + Allocator* allocator, + int32_t max_height, + int32_t branching_factor) + : kMaxHeight_(max_height), + kBranching_(branching_factor), + kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), + compare_(cmp), + allocator_(allocator), + head_(AllocateNode(0, max_height)), + max_height_(1), + prev_height_(1) { + assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); + assert(branching_factor > 1 && + kBranching_ == static_cast(branching_factor)); + assert(kScaledInverseBranching_ > 0); + // Allocate the prev_ Node* array, directly from the passed-in allocator. + // prev_ does not need to be freed, as its life cycle is tied up with + // the allocator as a whole. + prev_ = reinterpret_cast( + allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); + for (int i = 0; i < kMaxHeight_; i++) { + head_->SetNext(i, nullptr); + prev_[i] = head_; + } +} + +template +char* InlineSkipList::AllocateKey(size_t key_size) { + return const_cast(AllocateNode(key_size, RandomHeight())->Key()); +} + +template +typename InlineSkipList::Node* +InlineSkipList::AllocateNode(size_t key_size, int height) { + auto prefix = sizeof(std::atomic) * (height - 1); + + // prefix is space for the height - 1 pointers that we store before + // the Node instance (next_[-(height - 1) .. -1]). Node starts at + // raw + prefix, and holds the bottom-mode (level 0) skip list pointer + // next_[0]. 
key_size is the bytes for the key, which comes just after + // the Node. + char* raw = allocator_->AllocateAligned(prefix + sizeof(Node) + key_size); + Node* x = reinterpret_cast(raw + prefix); + + // Once we've linked the node into the skip list we don't actually need + // to know its height, because we can implicitly use the fact that we + // traversed into a node at level h to known that h is a valid level + // for that node. We need to convey the height to the Insert step, + // however, so that it can perform the proper links. Since we're not + // using the pointers at the moment, StashHeight temporarily borrow + // storage from next_[0] for that purpose. + x->StashHeight(height); + return x; +} + +template +void InlineSkipList::Insert(const char* key) { + // InsertConcurrently often can't maintain the prev_ invariants, so + // it just sets prev_height_ to zero, letting us know that we should + // ignore it. A relaxed load suffices here because write thread + // synchronization separates Insert calls from InsertConcurrently calls. + auto prev_height = prev_height_.load(std::memory_order_relaxed); + + // fast path for sequential insertion + if (prev_height > 0 && !KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0)) && + (prev_[0] == head_ || KeyIsAfterNode(key, prev_[0]))) { + assert(prev_[0] != head_ || (prev_height == 1 && GetMaxHeight() == 1)); + + // Outside of this method prev_[1..max_height_] is the predecessor + // of prev_[0], and prev_height_ refers to prev_[0]. Inside Insert + // prev_[0..max_height - 1] is the predecessor of key. Switch from + // the external state to the internal + for (int i = 1; i < prev_height; i++) { + prev_[i] = prev_[0]; + } + } else { + // TODO(opt): we could use a NoBarrier predecessor search as an + // optimization for architectures where memory_order_acquire needs + // a synchronization instruction. 
Doesn't matter on x86 + FindLessThan(key, prev_); + } + + // Our data structure does not allow duplicate insertion + assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->Key())); + + // Find the Node that we placed before the key in AllocateKey + Node* x = reinterpret_cast(const_cast(key)) - 1; + int height = x->UnstashHeight(); + assert(height >= 1 && height <= kMaxHeight_); + + if (height > GetMaxHeight()) { + for (int i = GetMaxHeight(); i < height; i++) { + prev_[i] = head_; + } + + // It is ok to mutate max_height_ without any synchronization + // with concurrent readers. A concurrent reader that observes + // the new value of max_height_ will see either the old value of + // new level pointers from head_ (nullptr), or a new value set in + // the loop below. In the former case the reader will + // immediately drop to the next level since nullptr sorts after all + // keys. In the latter case the reader will use the new node. + max_height_.store(height, std::memory_order_relaxed); + } + + for (int i = 0; i < height; i++) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. 
+ x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i)); + prev_[i]->SetNext(i, x); + } + prev_[0] = x; + prev_height_.store(height, std::memory_order_relaxed); +} + +template +void InlineSkipList::FindLevelSplice(const char* key, Node* before, + Node* after, int level, + Node** out_prev, + Node** out_next) { + while (true) { + Node* next = before->Next(level); + assert(before == head_ || next == nullptr || + KeyIsAfterNode(next->Key(), before)); + assert(before == head_ || KeyIsAfterNode(key, before)); + if (next == after || !KeyIsAfterNode(key, next)) { + // found it + *out_prev = before; + *out_next = next; + return; + } + before = next; + } +} + +template +void InlineSkipList::InsertConcurrently(const char* key) { + Node* x = reinterpret_cast(const_cast(key)) - 1; + int height = x->UnstashHeight(); + assert(height >= 1 && height <= kMaxHeight_); + + // We don't have a lock-free algorithm for updating prev_, but we do have + // the option of invalidating the entire sequential-insertion cache. + // prev_'s invariant is that prev_[i] (i > 0) is the predecessor of + // prev_[0] at that level. We're only going to violate that if height + // > 1 and key lands after prev_[height - 1] but before prev_[0]. + // Comparisons are pretty expensive, so an easier version is to just + // clear the cache if height > 1. We only write to prev_height_ if the + // nobody else has, to avoid invalidating the root of the skip list in + // all of the other CPU caches. 
+ if (height > 1 && prev_height_.load(std::memory_order_relaxed) != 0) { + prev_height_.store(0, std::memory_order_relaxed); + } + + int max_height = max_height_.load(std::memory_order_relaxed); + while (height > max_height) { + if (max_height_.compare_exchange_strong(max_height, height)) { + // successfully updated it + max_height = height; + break; + } + // else retry, possibly exiting the loop because somebody else + // increased it + } + assert(max_height <= kMaxPossibleHeight); + + Node* prev[kMaxPossibleHeight + 1]; + Node* next[kMaxPossibleHeight + 1]; + prev[max_height] = head_; + next[max_height] = nullptr; + for (int i = max_height - 1; i >= 0; --i) { + FindLevelSplice(key, prev[i + 1], next[i + 1], i, &prev[i], &next[i]); + } + for (int i = 0; i < height; ++i) { + while (true) { + x->NoBarrier_SetNext(i, next[i]); + if (prev[i]->CASNext(i, next[i], x)) { + // success + break; + } + // CAS failed, we need to recompute prev and next. It is unlikely + // to be helpful to try to use a different level as we redo the + // search, because it should be unlikely that lots of nodes have + // been inserted between prev[i] and next[i]. No point in using + // next[i] as the after hint, because we know it is stale. + FindLevelSplice(key, prev[i], nullptr, i, &prev[i], &next[i]); + } + } +} + +template +bool InlineSkipList::Contains(const char* key) const { + Node* x = FindGreaterOrEqual(key); + if (x != nullptr && Equal(key, x->Key())) { + return true; + } else { + return false; + } +} + +} // namespace rocksdb diff --git a/external/rocksdb/db/inlineskiplist_test.cc b/external/rocksdb/db/inlineskiplist_test.cc new file mode 100644 index 0000000000..5743bacec6 --- /dev/null +++ b/external/rocksdb/db/inlineskiplist_test.cc @@ -0,0 +1,475 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/inlineskiplist.h" +#include +#include "rocksdb/env.h" +#include "util/concurrent_arena.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +// Our test skip list stores 8-byte unsigned integers +typedef uint64_t Key; + +static const char* Encode(const uint64_t* key) { + return reinterpret_cast(key); +} + +static Key Decode(const char* key) { + Key rv; + memcpy(&rv, key, sizeof(Key)); + return rv; +} + +struct TestComparator { + int operator()(const char* a, const char* b) const { + if (Decode(a) < Decode(b)) { + return -1; + } else if (Decode(a) > Decode(b)) { + return +1; + } else { + return 0; + } + } +}; + +class InlineSkipTest : public testing::Test {}; + +TEST_F(InlineSkipTest, Empty) { + Arena arena; + TestComparator cmp; + InlineSkipList list(cmp, &arena); + Key key = 10; + ASSERT_TRUE(!list.Contains(Encode(&key))); + + InlineSkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToFirst(); + ASSERT_TRUE(!iter.Valid()); + key = 100; + iter.Seek(Encode(&key)); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToLast(); + ASSERT_TRUE(!iter.Valid()); +} + +TEST_F(InlineSkipTest, InsertAndLookup) { + const int N = 2000; + const int R = 5000; + Random rnd(1000); + std::set keys; + ConcurrentArena arena; + TestComparator cmp; + InlineSkipList list(cmp, &arena); + for (int i = 0; i < N; i++) { + Key key = rnd.Next() % R; + if (keys.insert(key).second) { + char* buf = list.AllocateKey(sizeof(Key)); + memcpy(buf, &key, sizeof(Key)); + list.Insert(buf); + } + } + + for (Key i = 0; i < R; i++) { + if (list.Contains(Encode(&i))) { + ASSERT_EQ(keys.count(i), 1U); + } else { + 
ASSERT_EQ(keys.count(i), 0U); + } + } + + // Simple iterator tests + { + InlineSkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + + uint64_t zero = 0; + iter.Seek(Encode(&zero)); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), Decode(iter.key())); + + iter.SeekToFirst(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), Decode(iter.key())); + + iter.SeekToLast(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.rbegin()), Decode(iter.key())); + } + + // Forward iteration test + for (Key i = 0; i < R; i++) { + InlineSkipList::Iterator iter(&list); + iter.Seek(Encode(&i)); + + // Compare against model iterator + std::set::iterator model_iter = keys.lower_bound(i); + for (int j = 0; j < 3; j++) { + if (model_iter == keys.end()) { + ASSERT_TRUE(!iter.Valid()); + break; + } else { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, Decode(iter.key())); + ++model_iter; + iter.Next(); + } + } + } + + // Backward iteration test + { + InlineSkipList::Iterator iter(&list); + iter.SeekToLast(); + + // Compare against model iterator + for (std::set::reverse_iterator model_iter = keys.rbegin(); + model_iter != keys.rend(); ++model_iter) { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, Decode(iter.key())); + iter.Prev(); + } + ASSERT_TRUE(!iter.Valid()); + } +} + +// We want to make sure that with a single writer and multiple +// concurrent readers (with no synchronization other than when a +// reader's iterator is created), the reader always observes all the +// data that was present in the skip list when the iterator was +// constructor. Because insertions are happening concurrently, we may +// also observe new values that were inserted since the iterator was +// constructed, but we should never miss any values that were present +// at iterator construction time. 
+// +// We generate multi-part keys: +// +// where: +// key is in range [0..K-1] +// gen is a generation number for key +// hash is hash(key,gen) +// +// The insertion code picks a random key, sets gen to be 1 + the last +// generation number inserted for that key, and sets hash to Hash(key,gen). +// +// At the beginning of a read, we snapshot the last inserted +// generation number for each key. We then iterate, including random +// calls to Next() and Seek(). For every key we encounter, we +// check that it is either expected given the initial snapshot or has +// been concurrently added since the iterator started. +class ConcurrentTest { + public: + static const uint32_t K = 8; + + private: + static uint64_t key(Key key) { return (key >> 40); } + static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } + static uint64_t hash(Key key) { return key & 0xff; } + + static uint64_t HashNumbers(uint64_t k, uint64_t g) { + uint64_t data[2] = {k, g}; + return Hash(reinterpret_cast(data), sizeof(data), 0); + } + + static Key MakeKey(uint64_t k, uint64_t g) { + assert(sizeof(Key) == sizeof(uint64_t)); + assert(k <= K); // We sometimes pass K to seek to the end of the skiplist + assert(g <= 0xffffffffu); + return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); + } + + static bool IsValidKey(Key k) { + return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); + } + + static Key RandomTarget(Random* rnd) { + switch (rnd->Next() % 10) { + case 0: + // Seek to beginning + return MakeKey(0, 0); + case 1: + // Seek to end + return MakeKey(K, 0); + default: + // Seek to middle + return MakeKey(rnd->Next() % K, 0); + } + } + + // Per-key generation + struct State { + std::atomic generation[K]; + void Set(int k, int v) { + generation[k].store(v, std::memory_order_release); + } + int Get(int k) { return generation[k].load(std::memory_order_acquire); } + + State() { + for (unsigned int k = 0; k < K; k++) { + Set(k, 0); + } + } + }; + + // Current state of the test + State 
current_; + + ConcurrentArena arena_; + + // InlineSkipList is not protected by mu_. We just use a single writer + // thread to modify it. + InlineSkipList list_; + + public: + ConcurrentTest() : list_(TestComparator(), &arena_) {} + + // REQUIRES: No concurrent calls to WriteStep or ConcurrentWriteStep + void WriteStep(Random* rnd) { + const uint32_t k = rnd->Next() % K; + const int g = current_.Get(k) + 1; + const Key new_key = MakeKey(k, g); + char* buf = list_.AllocateKey(sizeof(Key)); + memcpy(buf, &new_key, sizeof(Key)); + list_.Insert(buf); + current_.Set(k, g); + } + + // REQUIRES: No concurrent calls for the same k + void ConcurrentWriteStep(uint32_t k) { + const int g = current_.Get(k) + 1; + const Key new_key = MakeKey(k, g); + char* buf = list_.AllocateKey(sizeof(Key)); + memcpy(buf, &new_key, sizeof(Key)); + list_.InsertConcurrently(buf); + ASSERT_EQ(g, current_.Get(k) + 1); + current_.Set(k, g); + } + + void ReadStep(Random* rnd) { + // Remember the initial committed state of the skiplist. + State initial_state; + for (unsigned int k = 0; k < K; k++) { + initial_state.Set(k, current_.Get(k)); + } + + Key pos = RandomTarget(rnd); + InlineSkipList::Iterator iter(&list_); + iter.Seek(Encode(&pos)); + while (true) { + Key current; + if (!iter.Valid()) { + current = MakeKey(K, 0); + } else { + current = Decode(iter.key()); + ASSERT_TRUE(IsValidKey(current)) << current; + } + ASSERT_LE(pos, current) << "should not go backwards"; + + // Verify that everything in [pos,current) was not present in + // initial_state. + while (pos < current) { + ASSERT_LT(key(pos), K) << pos; + + // Note that generation 0 is never inserted, so it is ok if + // <*,0,*> is missing. 
+ ASSERT_TRUE((gen(pos) == 0U) || + (gen(pos) > static_cast(initial_state.Get( + static_cast(key(pos)))))) + << "key: " << key(pos) << "; gen: " << gen(pos) + << "; initgen: " << initial_state.Get(static_cast(key(pos))); + + // Advance to next key in the valid key space + if (key(pos) < key(current)) { + pos = MakeKey(key(pos) + 1, 0); + } else { + pos = MakeKey(key(pos), gen(pos) + 1); + } + } + + if (!iter.Valid()) { + break; + } + + if (rnd->Next() % 2) { + iter.Next(); + pos = MakeKey(key(pos), gen(pos) + 1); + } else { + Key new_target = RandomTarget(rnd); + if (new_target > pos) { + pos = new_target; + iter.Seek(Encode(&new_target)); + } + } + } + } +}; +const uint32_t ConcurrentTest::K; + +// Simple test that does single-threaded testing of the ConcurrentTest +// scaffolding. +TEST_F(InlineSkipTest, ConcurrentReadWithoutThreads) { + ConcurrentTest test; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 10000; i++) { + test.ReadStep(&rnd); + test.WriteStep(&rnd); + } +} + +TEST_F(InlineSkipTest, ConcurrentInsertWithoutThreads) { + ConcurrentTest test; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 10000; i++) { + test.ReadStep(&rnd); + uint32_t base = rnd.Next(); + for (int j = 0; j < 4; ++j) { + test.ConcurrentWriteStep((base + j) % ConcurrentTest::K); + } + } +} + +class TestState { + public: + ConcurrentTest t_; + int seed_; + std::atomic quit_flag_; + std::atomic next_writer_; + + enum ReaderState { STARTING, RUNNING, DONE }; + + explicit TestState(int s) + : seed_(s), + quit_flag_(false), + state_(STARTING), + pending_writers_(0), + state_cv_(&mu_) {} + + void Wait(ReaderState s) { + mu_.Lock(); + while (state_ != s) { + state_cv_.Wait(); + } + mu_.Unlock(); + } + + void Change(ReaderState s) { + mu_.Lock(); + state_ = s; + state_cv_.Signal(); + mu_.Unlock(); + } + + void AdjustPendingWriters(int delta) { + mu_.Lock(); + pending_writers_ += delta; + if (pending_writers_ == 0) { + state_cv_.Signal(); + } + mu_.Unlock(); + } + + void 
WaitForPendingWriters() { + mu_.Lock(); + while (pending_writers_ != 0) { + state_cv_.Wait(); + } + mu_.Unlock(); + } + + private: + port::Mutex mu_; + ReaderState state_; + int pending_writers_; + port::CondVar state_cv_; +}; + +static void ConcurrentReader(void* arg) { + TestState* state = reinterpret_cast(arg); + Random rnd(state->seed_); + int64_t reads = 0; + state->Change(TestState::RUNNING); + while (!state->quit_flag_.load(std::memory_order_acquire)) { + state->t_.ReadStep(&rnd); + ++reads; + } + state->Change(TestState::DONE); +} + +static void ConcurrentWriter(void* arg) { + TestState* state = reinterpret_cast(arg); + uint32_t k = state->next_writer_++ % ConcurrentTest::K; + state->t_.ConcurrentWriteStep(k); + state->AdjustPendingWriters(-1); +} + +static void RunConcurrentRead(int run) { + const int seed = test::RandomSeed() + (run * 100); + Random rnd(seed); + const int N = 1000; + const int kSize = 1000; + for (int i = 0; i < N; i++) { + if ((i % 100) == 0) { + fprintf(stderr, "Run %d of %d\n", i, N); + } + TestState state(seed + 1); + Env::Default()->Schedule(ConcurrentReader, &state); + state.Wait(TestState::RUNNING); + for (int k = 0; k < kSize; ++k) { + state.t_.WriteStep(&rnd); + } + state.quit_flag_.store(true, std::memory_order_release); + state.Wait(TestState::DONE); + } +} + +static void RunConcurrentInsert(int run, int write_parallelism = 4) { + Env::Default()->SetBackgroundThreads(1 + write_parallelism, + Env::Priority::LOW); + const int seed = test::RandomSeed() + (run * 100); + Random rnd(seed); + const int N = 1000; + const int kSize = 1000; + for (int i = 0; i < N; i++) { + if ((i % 100) == 0) { + fprintf(stderr, "Run %d of %d\n", i, N); + } + TestState state(seed + 1); + Env::Default()->Schedule(ConcurrentReader, &state); + state.Wait(TestState::RUNNING); + for (int k = 0; k < kSize; k += write_parallelism) { + state.next_writer_ = rnd.Next(); + state.AdjustPendingWriters(write_parallelism); + for (int p = 0; p < write_parallelism; ++p) 
{ + Env::Default()->Schedule(ConcurrentWriter, &state); + } + state.WaitForPendingWriters(); + } + state.quit_flag_.store(true, std::memory_order_release); + state.Wait(TestState::DONE); + } +} + +TEST_F(InlineSkipTest, ConcurrentRead1) { RunConcurrentRead(1); } +TEST_F(InlineSkipTest, ConcurrentRead2) { RunConcurrentRead(2); } +TEST_F(InlineSkipTest, ConcurrentRead3) { RunConcurrentRead(3); } +TEST_F(InlineSkipTest, ConcurrentRead4) { RunConcurrentRead(4); } +TEST_F(InlineSkipTest, ConcurrentRead5) { RunConcurrentRead(5); } +TEST_F(InlineSkipTest, ConcurrentInsert1) { RunConcurrentInsert(1); } +TEST_F(InlineSkipTest, ConcurrentInsert2) { RunConcurrentInsert(2); } +TEST_F(InlineSkipTest, ConcurrentInsert3) { RunConcurrentInsert(3); } + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/internal_stats.cc b/external/rocksdb/db/internal_stats.cc index 4e37c1d08f..5170802374 100644 --- a/external/rocksdb/db/internal_stats.cc +++ b/external/rocksdb/db/internal_stats.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "db/column_family.h" @@ -35,8 +36,7 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) { "\n** Compaction Stats [%s] **\n" "Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) " "Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) " - "Comp(sec) Comp(cnt) Avg(sec) " - "Stall(cnt) KeyIn KeyDrop\n" + "Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop\n" "--------------------------------------------------------------------" "-----------------------------------------------------------" "--------------------------------------\n", @@ -44,9 +44,9 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) { } void PrintLevelStats(char* buf, size_t len, const std::string& name, - int num_files, int being_compacted, double total_file_size, double score, - double w_amp, uint64_t stalls, 
- const InternalStats::CompactionStats& stats) { + int num_files, int being_compacted, double total_file_size, + double score, double w_amp, + const InternalStats::CompactionStats& stats) { uint64_t bytes_read = stats.bytes_read_non_output_levels + stats.bytes_read_output_level; int64_t bytes_new = @@ -70,8 +70,6 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, "%9.0f " /* Comp(sec) */ "%9d " /* Comp(cnt) */ "%8.3f " /* Avg(sec) */ - "%10" PRIu64 - " " /* Stall(cnt) */ "%7s " /* KeyIn */ "%6s\n", /* KeyDrop */ name.c_str(), @@ -82,13 +80,29 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, bytes_read / kMB / elapsed, stats.bytes_written / kMB / elapsed, stats.micros / kMicrosInSec, stats.count, stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count, - stalls, num_input_records.c_str(), num_dropped_records.c_str()); + num_input_records.c_str(), num_dropped_records.c_str()); } + +// Assumes that trailing numbers represent an optional argument. This requires +// property names to not end with numbers. 
+std::pair GetPropertyNameAndArg(const Slice& property) { + Slice name = property, arg = property; + size_t sfx_len = 0; + while (sfx_len < property.size() && + isdigit(property[property.size() - sfx_len - 1])) { + ++sfx_len; + } + name.remove_suffix(sfx_len); + arg.remove_prefix(property.size() - sfx_len); + return {name, arg}; } +} // anonymous namespace static const std::string rocksdb_prefix = "rocksdb."; static const std::string num_files_at_level_prefix = "num-files-at-level"; +static const std::string compression_ratio_at_level_prefix = + "compression-ratio-at-level"; static const std::string allstats = "stats"; static const std::string sstables = "sstables"; static const std::string cfstats = "cfstats"; @@ -102,9 +116,8 @@ static const std::string compaction_pending = "compaction-pending"; static const std::string background_errors = "background-errors"; static const std::string cur_size_active_mem_table = "cur-size-active-mem-table"; -static const std::string cur_size_unflushed_mem_tables = - "cur-size-all-mem-tables"; -static const std::string cur_size_all_mem_tables = "size-all-mem-tables"; +static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables"; +static const std::string size_all_mem_tables = "size-all-mem-tables"; static const std::string num_entries_active_mem_table = "num-entries-active-mem-table"; static const std::string num_entries_imm_mem_tables = @@ -121,6 +134,8 @@ static const std::string is_file_deletions_enabled = static const std::string num_snapshots = "num-snapshots"; static const std::string oldest_snapshot_time = "oldest-snapshot-time"; static const std::string num_live_versions = "num-live-versions"; +static const std::string current_version_number = + "current-super-version-number"; static const std::string estimate_live_data_size = "estimate-live-data-size"; static const std::string base_level = "base-level"; static const std::string total_sst_files_size = "total-sst-files-size"; @@ -130,27 +145,38 @@ static 
const std::string aggregated_table_properties = "aggregated-table-properties"; static const std::string aggregated_table_properties_at_level = aggregated_table_properties + "-at-level"; +static const std::string num_running_compactions = "num-running-compactions"; +static const std::string num_running_flushes = "num-running-flushes"; const std::string DB::Properties::kNumFilesAtLevelPrefix = rocksdb_prefix + num_files_at_level_prefix; +const std::string DB::Properties::kCompressionRatioAtLevelPrefix = + rocksdb_prefix + compression_ratio_at_level_prefix; const std::string DB::Properties::kStats = rocksdb_prefix + allstats; const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables; const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats; const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; +const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; const std::string DB::Properties::kNumImmutableMemTable = rocksdb_prefix + num_immutable_mem_table; +const std::string DB::Properties::kNumImmutableMemTableFlushed = + rocksdb_prefix + num_immutable_mem_table_flushed; const std::string DB::Properties::kMemTableFlushPending = rocksdb_prefix + mem_table_flush_pending; const std::string DB::Properties::kCompactionPending = rocksdb_prefix + compaction_pending; +const std::string DB::Properties::kNumRunningCompactions = + rocksdb_prefix + num_running_compactions; +const std::string DB::Properties::kNumRunningFlushes = + rocksdb_prefix + num_running_flushes; const std::string DB::Properties::kBackgroundErrors = rocksdb_prefix + background_errors; const std::string DB::Properties::kCurSizeActiveMemTable = rocksdb_prefix + cur_size_active_mem_table; const std::string DB::Properties::kCurSizeAllMemTables = - rocksdb_prefix + cur_size_unflushed_mem_tables; -const std::string DB::Properties::kSizeAllMemTables = rocksdb_prefix + cur_size_all_mem_tables; +const std::string DB::Properties::kSizeAllMemTables = + 
rocksdb_prefix + size_all_mem_tables; const std::string DB::Properties::kNumEntriesActiveMemTable = rocksdb_prefix + num_entries_active_mem_table; const std::string DB::Properties::kNumEntriesImmMemTables = @@ -171,10 +197,13 @@ const std::string DB::Properties::kOldestSnapshotTime = rocksdb_prefix + oldest_snapshot_time; const std::string DB::Properties::kNumLiveVersions = rocksdb_prefix + num_live_versions; +const std::string DB::Properties::kCurrentSuperVersionNumber = + rocksdb_prefix + current_version_number; const std::string DB::Properties::kEstimateLiveDataSize = rocksdb_prefix + estimate_live_data_size; const std::string DB::Properties::kTotalSstFilesSize = rocksdb_prefix + total_sst_files_size; +const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level; const std::string DB::Properties::kEstimatePendingCompactionBytes = rocksdb_prefix + estimate_pending_comp_bytes; const std::string DB::Properties::kAggregatedTableProperties = @@ -182,284 +211,383 @@ const std::string DB::Properties::kAggregatedTableProperties = const std::string DB::Properties::kAggregatedTablePropertiesAtLevel = rocksdb_prefix + aggregated_table_properties_at_level; -DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, - bool* need_out_of_mutex) { - assert(is_int_property != nullptr); - assert(need_out_of_mutex != nullptr); - Slice in = property; - Slice prefix(rocksdb_prefix); - *need_out_of_mutex = false; - *is_int_property = false; - if (!in.starts_with(prefix)) { - return kUnknown; +const std::unordered_map InternalStats::ppt_name_to_info = { + {DB::Properties::kNumFilesAtLevelPrefix, + {false, &InternalStats::HandleNumFilesAtLevel, nullptr}}, + {DB::Properties::kCompressionRatioAtLevelPrefix, + {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr}}, + {DB::Properties::kLevelStats, + {false, &InternalStats::HandleLevelStats, nullptr}}, + {DB::Properties::kStats, {false, &InternalStats::HandleStats, nullptr}}, + 
{DB::Properties::kCFStats, {false, &InternalStats::HandleCFStats, nullptr}}, + {DB::Properties::kDBStats, {false, &InternalStats::HandleDBStats, nullptr}}, + {DB::Properties::kSSTables, + {false, &InternalStats::HandleSsTables, nullptr}}, + {DB::Properties::kAggregatedTableProperties, + {false, &InternalStats::HandleAggregatedTableProperties, nullptr}}, + {DB::Properties::kAggregatedTablePropertiesAtLevel, + {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, nullptr}}, + {DB::Properties::kNumImmutableMemTable, + {false, nullptr, &InternalStats::HandleNumImmutableMemTable}}, + {DB::Properties::kNumImmutableMemTableFlushed, + {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed}}, + {DB::Properties::kMemTableFlushPending, + {false, nullptr, &InternalStats::HandleMemTableFlushPending}}, + {DB::Properties::kCompactionPending, + {false, nullptr, &InternalStats::HandleCompactionPending}}, + {DB::Properties::kBackgroundErrors, + {false, nullptr, &InternalStats::HandleBackgroundErrors}}, + {DB::Properties::kCurSizeActiveMemTable, + {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable}}, + {DB::Properties::kCurSizeAllMemTables, + {false, nullptr, &InternalStats::HandleCurSizeAllMemTables}}, + {DB::Properties::kSizeAllMemTables, + {false, nullptr, &InternalStats::HandleSizeAllMemTables}}, + {DB::Properties::kNumEntriesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable}}, + {DB::Properties::kNumEntriesImmMemTables, + {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables}}, + {DB::Properties::kNumDeletesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable}}, + {DB::Properties::kNumDeletesImmMemTables, + {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables}}, + {DB::Properties::kEstimateNumKeys, + {false, nullptr, &InternalStats::HandleEstimateNumKeys}}, + {DB::Properties::kEstimateTableReadersMem, + {true, nullptr, &InternalStats::HandleEstimateTableReadersMem}}, + 
{DB::Properties::kIsFileDeletionsEnabled, + {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled}}, + {DB::Properties::kNumSnapshots, + {false, nullptr, &InternalStats::HandleNumSnapshots}}, + {DB::Properties::kOldestSnapshotTime, + {false, nullptr, &InternalStats::HandleOldestSnapshotTime}}, + {DB::Properties::kNumLiveVersions, + {false, nullptr, &InternalStats::HandleNumLiveVersions}}, + {DB::Properties::kCurrentSuperVersionNumber, + {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber}}, + {DB::Properties::kEstimateLiveDataSize, + {true, nullptr, &InternalStats::HandleEstimateLiveDataSize}}, + {DB::Properties::kBaseLevel, + {false, nullptr, &InternalStats::HandleBaseLevel}}, + {DB::Properties::kTotalSstFilesSize, + {false, nullptr, &InternalStats::HandleTotalSstFilesSize}}, + {DB::Properties::kEstimatePendingCompactionBytes, + {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes}}, + {DB::Properties::kNumRunningFlushes, + {false, nullptr, &InternalStats::HandleNumRunningFlushes}}, + {DB::Properties::kNumRunningCompactions, + {false, nullptr, &InternalStats::HandleNumRunningCompactions}}, +}; + +const DBPropertyInfo* GetPropertyInfo(const Slice& property) { + std::string ppt_name = GetPropertyNameAndArg(property).first.ToString(); + auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name); + if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) { + return nullptr; } - in.remove_prefix(prefix.size()); - - if (in.starts_with(num_files_at_level_prefix)) { - return kNumFilesAtLevel; - } else if (in == levelstats) { - return kLevelStats; - } else if (in == allstats) { - return kStats; - } else if (in == cfstats) { - return kCFStats; - } else if (in == dbstats) { - return kDBStats; - } else if (in == sstables) { - return kSsTables; - } else if (in == aggregated_table_properties) { - return kAggregatedTableProperties; - } else if (in.starts_with(aggregated_table_properties_at_level)) { - return 
kAggregatedTablePropertiesAtLevel; + return &ppt_info_iter->second; +} + +bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info, + const Slice& property, + std::string* value) { + assert(value != nullptr); + assert(property_info.handle_string != nullptr); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_string))(value, arg); +} + +bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info, + uint64_t* value, DBImpl* db) { + assert(value != nullptr); + assert(property_info.handle_int != nullptr && + !property_info.need_out_of_mutex); + db->mutex_.AssertHeld(); + return (this->*(property_info.handle_int))(value, db, nullptr /* version */); +} + +bool InternalStats::GetIntPropertyOutOfMutex( + const DBPropertyInfo& property_info, Version* version, uint64_t* value) { + assert(value != nullptr); + assert(property_info.handle_int != nullptr && + property_info.need_out_of_mutex); + return (this->*(property_info.handle_int))(value, nullptr /* db */, version); +} + +bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) { + uint64_t level; + const auto* vstorage = cfd_->current()->storage_info(); + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", + vstorage->NumLevelFiles(static_cast(level))); + *value = buf; + return true; } +} - *is_int_property = true; - if (in == num_immutable_mem_table) { - return kNumImmutableMemTable; - } else if (in == num_immutable_mem_table_flushed) { - return kNumImmutableMemTableFlushed; - } else if (in == mem_table_flush_pending) { - return kMemtableFlushPending; - } else if (in == compaction_pending) { - return kCompactionPending; - } else if (in == background_errors) { - return kBackgroundErrors; - } else if (in == cur_size_active_mem_table) { - return kCurSizeActiveMemTable; - } else if (in == 
cur_size_unflushed_mem_tables) { - return kCurSizeAllMemTables; - } else if (in == cur_size_all_mem_tables) { - return kSizeAllMemTables; - } else if (in == num_entries_active_mem_table) { - return kNumEntriesInMutableMemtable; - } else if (in == num_entries_imm_mem_tables) { - return kNumEntriesInImmutableMemtable; - } else if (in == num_deletes_active_mem_table) { - return kNumDeletesInMutableMemtable; - } else if (in == num_deletes_imm_mem_tables) { - return kNumDeletesInImmutableMemtable; - } else if (in == estimate_num_keys) { - return kEstimatedNumKeys; - } else if (in == estimate_table_readers_mem) { - *need_out_of_mutex = true; - return kEstimatedUsageByTableReaders; - } else if (in == is_file_deletions_enabled) { - return kIsFileDeletionEnabled; - } else if (in == num_snapshots) { - return kNumSnapshots; - } else if (in == oldest_snapshot_time) { - return kOldestSnapshotTime; - } else if (in == num_live_versions) { - return kNumLiveVersions; - } else if (in == estimate_live_data_size) { - *need_out_of_mutex = true; - return kEstimateLiveDataSize; - } else if (in == base_level) { - return kBaseLevel; - } else if (in == total_sst_files_size) { - return kTotalSstFilesSize; - } else if (in == estimate_pending_comp_bytes) { - return kEstimatePendingCompactionBytes; +bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value, + Slice suffix) { + uint64_t level; + const auto* vstorage = cfd_->current()->storage_info(); + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || level >= static_cast(number_levels_)) { + return false; } - return kUnknown; + *value = ToString( + vstorage->GetEstimatedCompressionRatioAtLevel(static_cast(level))); + return true; } -bool InternalStats::GetIntPropertyOutOfMutex(DBPropertyType property_type, - Version* version, - uint64_t* value) const { - assert(value != nullptr); +bool InternalStats::HandleLevelStats(std::string* value, Slice suffix) { + char buf[1000]; const auto* vstorage = 
cfd_->current()->storage_info(); + snprintf(buf, sizeof(buf), + "Level Files Size(MB)\n" + "--------------------\n"); + value->append(buf); - switch (property_type) { - case kEstimatedUsageByTableReaders: - *value = (version == nullptr) ? - 0 : version->GetMemoryUsageByTableReaders(); - return true; - case kEstimateLiveDataSize: - *value = vstorage->EstimateLiveDataSize(); - return true; - default: - return false; + for (int level = 0; level < number_levels_; level++) { + snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, + vstorage->NumLevelFiles(level), + vstorage->NumLevelBytes(level) / kMB); + value->append(buf); } + return true; } -bool InternalStats::GetStringProperty(DBPropertyType property_type, - const Slice& property, - std::string* value) { - assert(value != nullptr); +bool InternalStats::HandleStats(std::string* value, Slice suffix) { + if (!HandleCFStats(value, suffix)) { + return false; + } + if (!HandleDBStats(value, suffix)) { + return false; + } + return true; +} + +bool InternalStats::HandleCFStats(std::string* value, Slice suffix) { + DumpCFStats(value); + return true; +} + +bool InternalStats::HandleDBStats(std::string* value, Slice suffix) { + DumpDBStats(value); + return true; +} + +bool InternalStats::HandleSsTables(std::string* value, Slice suffix) { auto* current = cfd_->current(); - const auto* vstorage = current->storage_info(); - Slice in = property; - - switch (property_type) { - case kNumFilesAtLevel: { - in.remove_prefix(strlen("rocksdb.num-files-at-level")); - uint64_t level; - bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || (int)level >= number_levels_) { - return false; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "%d", - vstorage->NumLevelFiles(static_cast(level))); - *value = buf; - return true; - } - } - case kLevelStats: { - char buf[1000]; - snprintf(buf, sizeof(buf), - "Level Files Size(MB)\n" - "--------------------\n"); - value->append(buf); + *value = current->DebugString(); + return 
true; +} - for (int level = 0; level < number_levels_; level++) { - snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - vstorage->NumLevelFiles(level), - vstorage->NumLevelBytes(level) / kMB); - value->append(buf); - } - return true; - } - case kStats: { - if (!GetStringProperty(kCFStats, DB::Properties::kCFStats, value)) { - return false; - } - if (!GetStringProperty(kDBStats, DB::Properties::kDBStats, value)) { - return false; - } - return true; - } - case kCFStats: { - DumpCFStats(value); - return true; - } - case kDBStats: { - DumpDBStats(value); - return true; - } - case kSsTables: - *value = current->DebugString(); - return true; - case kAggregatedTableProperties: { - std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); - if (!s.ok()) { - return false; - } - *value = tp->ToString(); - return true; - } - case kAggregatedTablePropertiesAtLevel: { - in.remove_prefix( - DB::Properties::kAggregatedTablePropertiesAtLevel.length()); - uint64_t level; - bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || static_cast(level) >= number_levels_) { - return false; - } - std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); - if (!s.ok()) { - return false; - } - *value = tp->ToString(); - return true; - } - default: - return false; +bool InternalStats::HandleAggregatedTableProperties(std::string* value, + Slice suffix) { + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + if (!s.ok()) { + return false; } + *value = tp->ToString(); + return true; } -bool InternalStats::GetIntProperty(DBPropertyType property_type, - uint64_t* value, DBImpl* db) const { - db->mutex_.AssertHeld(); +bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, + Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } + 
std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties( + &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *value = tp->ToString(); + return true; +} + +bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->imm()->NumNotFlushed(); + return true; +} + +bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value, + DBImpl* db, + Version* version) { + *value = cfd_->imm()->NumFlushed(); + return true; +} + +bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* db, + Version* version) { + // Return number of mem tables that are ready to flush (made immutable) + *value = (cfd_->imm()->IsFlushPending() ? 1 : 0); + return true; +} + +bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->num_running_flushes(); + return true; +} + +bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* db, + Version* version) { + // 1 if the system already determines at least one compaction is needed. + // 0 otherwise, const auto* vstorage = cfd_->current()->storage_info(); + *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0); + return true; +} - switch (property_type) { - case kNumImmutableMemTable: - *value = cfd_->imm()->NumNotFlushed(); - return true; - case kNumImmutableMemTableFlushed: - *value = cfd_->imm()->NumFlushed(); - return true; - case kMemtableFlushPending: - // Return number of mem tables that are ready to flush (made immutable) - *value = (cfd_->imm()->IsFlushPending() ? 1 : 0); - return true; - case kCompactionPending: - // 1 if the system already determines at least one compaction is needed. - // 0 otherwise, - *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0); - return true; - case kBackgroundErrors: - // Accumulated number of errors in background flushes or compactions. 
- *value = GetBackgroundErrorCount(); - return true; - case kCurSizeActiveMemTable: - // Current size of the active memtable - *value = cfd_->mem()->ApproximateMemoryUsage(); - return true; - case kCurSizeAllMemTables: - // Current size of the active memtable + immutable memtables - *value = cfd_->mem()->ApproximateMemoryUsage() + - cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); - return true; - case kSizeAllMemTables: - *value = cfd_->mem()->ApproximateMemoryUsage() + - cfd_->imm()->ApproximateMemoryUsage(); - return true; - case kNumEntriesInMutableMemtable: - // Current number of entires in the active memtable - *value = cfd_->mem()->num_entries(); - return true; - case kNumEntriesInImmutableMemtable: - // Current number of entries in the immutable memtables - *value = cfd_->imm()->current()->GetTotalNumEntries(); - return true; - case kNumDeletesInMutableMemtable: - // Current number of entires in the active memtable - *value = cfd_->mem()->num_deletes(); - return true; - case kNumDeletesInImmutableMemtable: - // Current number of entries in the immutable memtables - *value = cfd_->imm()->current()->GetTotalNumDeletes(); - return true; - case kEstimatedNumKeys: - // Estimate number of entries in the column family: - // Use estimated entries in tables + total entries in memtables. 
- *value = cfd_->mem()->num_entries() + - cfd_->imm()->current()->GetTotalNumEntries() - - (cfd_->mem()->num_deletes() + - cfd_->imm()->current()->GetTotalNumDeletes()) * - 2 + - vstorage->GetEstimatedActiveKeys(); - return true; - case kNumSnapshots: - *value = db->snapshots().count(); - return true; - case kOldestSnapshotTime: - *value = static_cast(db->snapshots().GetOldestSnapshotTime()); - return true; - case kNumLiveVersions: - *value = cfd_->GetNumLiveVersions(); - return true; - case kIsFileDeletionEnabled: - *value = db->IsFileDeletionsEnabled(); - return true; - case kBaseLevel: - *value = vstorage->base_level(); - return true; - case kTotalSstFilesSize: - *value = cfd_->GetTotalSstFilesSize(); - return true; - case kEstimatePendingCompactionBytes: - *value = vstorage->estimated_compaction_needed_bytes(); - return true; - default: - return false; - } +bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->num_running_compactions_; + return true; +} + +bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* db, + Version* version) { + // Accumulated number of errors in background flushes or compactions. 
+ *value = GetBackgroundErrorCount(); + return true; +} + +bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db, + Version* version) { + // Current size of the active memtable + *value = cfd_->mem()->ApproximateMemoryUsage(); + return true; +} + +bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, + Version* version) { + // Current size of the active memtable + immutable memtables + *value = cfd_->mem()->ApproximateMemoryUsage() + + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); + return true; +} + +bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->mem()->ApproximateMemoryUsage() + + cfd_->imm()->ApproximateMemoryUsage(); + return true; +} + +bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entires in the active memtable + *value = cfd_->mem()->num_entries(); + return true; +} + +bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entries in the immutable memtables + *value = cfd_->imm()->current()->GetTotalNumEntries(); + return true; +} + +bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entires in the active memtable + *value = cfd_->mem()->num_deletes(); + return true; +} + +bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entries in the immutable memtables + *value = cfd_->imm()->current()->GetTotalNumDeletes(); + return true; +} + +bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* db, + Version* version) { + // Estimate number of entries in the column family: + // Use estimated entries in tables + total entries in memtables. 
+ const auto* vstorage = cfd_->current()->storage_info(); + *value = cfd_->mem()->num_entries() + + cfd_->imm()->current()->GetTotalNumEntries() - + (cfd_->mem()->num_deletes() + + cfd_->imm()->current()->GetTotalNumDeletes()) * + 2 + + vstorage->GetEstimatedActiveKeys(); + return true; +} + +bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->snapshots().count(); + return true; +} + +bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, + Version* version) { + *value = static_cast(db->snapshots().GetOldestSnapshotTime()); + return true; +} + +bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->GetNumLiveVersions(); + return true; +} + +bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->GetSuperVersionNumber(); + return true; +} + +bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->IsFileDeletionsEnabled(); + return true; +} + +bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* db, + Version* version) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->base_level(); + return true; +} + +bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->GetTotalSstFilesSize(); + return true; +} + +bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, + DBImpl* db, + Version* version) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->estimated_compaction_needed_bytes(); + return true; +} + +bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, + Version* version) { + *value = (version == nullptr) ? 
0 : version->GetMemoryUsageByTableReaders(); + return true; +} + +bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db, + Version* version) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->EstimateLiveDataSize(); + return true; } void InternalStats::DumpDBStats(std::string* value) { @@ -472,17 +600,14 @@ void InternalStats::DumpDBStats(std::string* value) { seconds_up, interval_seconds_up); value->append(buf); // Cumulative - uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN]; - uint64_t num_keys_written = db_stats_[InternalStats::NUMBER_KEYS_WRITTEN]; - uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER]; - uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF]; - uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES]; - uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED]; - uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL]; - uint64_t write_stall_micros = db_stats_[InternalStats::WRITE_STALL_MICROS]; - uint64_t compact_bytes_read = 0; - uint64_t compact_bytes_write = 0; - uint64_t compact_micros = 0; + uint64_t user_bytes_written = GetDBStats(InternalStats::BYTES_WRITTEN); + uint64_t num_keys_written = GetDBStats(InternalStats::NUMBER_KEYS_WRITTEN); + uint64_t write_other = GetDBStats(InternalStats::WRITE_DONE_BY_OTHER); + uint64_t write_self = GetDBStats(InternalStats::WRITE_DONE_BY_SELF); + uint64_t wal_bytes = GetDBStats(InternalStats::WAL_FILE_BYTES); + uint64_t wal_synced = GetDBStats(InternalStats::WAL_FILE_SYNCED); + uint64_t write_with_wal = GetDBStats(InternalStats::WRITE_WITH_WAL); + uint64_t write_stall_micros = GetDBStats(InternalStats::WRITE_STALL_MICROS); const int kHumanMicrosLen = 32; char human_micros[kHumanMicrosLen]; @@ -490,15 +615,15 @@ void InternalStats::DumpDBStats(std::string* value) { // Data // writes: total number of write requests. 
// keys: total number of key updates issued by all the write requests - // batches: number of group commits issued to the DB. Each group can contain - // one or more writes. + // commit groups: number of group commits issued to the DB. Each group can + // contain one or more writes. // so writes/keys is the average number of put in multi-put or put - // writes/batches is the average group commit size. + // writes/groups is the average group commit size. // // The format is the same for interval stats. snprintf(buf, sizeof(buf), - "Cumulative writes: %s writes, %s keys, %s batches, " - "%.1f writes per batch, ingest: %.2f GB, %.2f MB/s\n", + "Cumulative writes: %s writes, %s keys, %s commit groups, " + "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n", NumberToHumanString(write_other + write_self).c_str(), NumberToHumanString(num_keys_written).c_str(), NumberToHumanString(write_self).c_str(), @@ -514,20 +639,6 @@ void InternalStats::DumpDBStats(std::string* value) { write_with_wal / static_cast(wal_synced + 1), wal_bytes / kGB, wal_bytes / kMB / seconds_up); value->append(buf); - // Compact - for (int level = 0; level < number_levels_; level++) { - compact_bytes_read += comp_stats_[level].bytes_read_output_level + - comp_stats_[level].bytes_read_non_output_levels; - compact_bytes_write += comp_stats_[level].bytes_written; - compact_micros += comp_stats_[level].micros; - } - snprintf(buf, sizeof(buf), - "Cumulative compaction: %.2f GB write, %.2f MB/s write, " - "%.2f GB read, %.2f MB/s read, %.1f seconds\n", - compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up, - compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up, - compact_micros / kMicrosInSec); - value->append(buf); // Stall AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true); snprintf(buf, sizeof(buf), @@ -543,8 +654,8 @@ void InternalStats::DumpDBStats(std::string* value) { uint64_t interval_num_keys_written = num_keys_written - 
db_stats_snapshot_.num_keys_written; snprintf(buf, sizeof(buf), - "Interval writes: %s writes, %s keys, %s batches, " - "%.1f writes per batch, ingest: %.2f MB, %.2f MB/s\n", + "Interval writes: %s writes, %s keys, %s commit groups, " + "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n", NumberToHumanString( interval_write_other + interval_write_self).c_str(), NumberToHumanString(interval_num_keys_written).c_str(), @@ -572,25 +683,6 @@ void InternalStats::DumpDBStats(std::string* value) { interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); value->append(buf); - // Compaction - uint64_t interval_compact_bytes_write = - compact_bytes_write - db_stats_snapshot_.compact_bytes_write; - uint64_t interval_compact_bytes_read = - compact_bytes_read - db_stats_snapshot_.compact_bytes_read; - uint64_t interval_compact_micros = - compact_micros - db_stats_snapshot_.compact_micros; - - snprintf( - buf, sizeof(buf), - "Interval compaction: %.2f GB write, %.2f MB/s write, " - "%.2f GB read, %.2f MB/s read, %.1f seconds\n", - interval_compact_bytes_write / kGB, - interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001), - interval_compact_bytes_read / kGB, - interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001), - interval_compact_micros / kMicrosInSec); - value->append(buf); - // Stall AppendHumanMicros( write_stall_micros - db_stats_snapshot_.write_stall_micros, @@ -622,9 +714,6 @@ void InternalStats::DumpDBStats(std::string* value) { db_stats_snapshot_.wal_synced = wal_synced; db_stats_snapshot_.write_with_wal = write_with_wal; db_stats_snapshot_.write_stall_micros = write_stall_micros; - db_stats_snapshot_.compact_bytes_write = compact_bytes_write; - db_stats_snapshot_.compact_bytes_read = compact_bytes_read; - db_stats_snapshot_.compact_micros = compact_micros; } void InternalStats::DumpCFStats(std::string* value) { @@ -661,61 +750,107 @@ void InternalStats::DumpCFStats(std::string* value) { int total_files = 0; int 
total_files_being_compacted = 0; double total_file_size = 0; - uint64_t total_slowdown_count_soft = 0; - uint64_t total_slowdown_count_hard = 0; - uint64_t total_stall_count = 0; for (int level = 0; level < number_levels_; level++) { int files = vstorage->NumLevelFiles(level); total_files += files; total_files_being_compacted += files_being_compacted[level]; if (comp_stats_[level].micros > 0 || files > 0) { - uint64_t stalls = - level == 0 ? (cf_stats_count_[LEVEL0_SLOWDOWN_TOTAL] + - cf_stats_count_[LEVEL0_NUM_FILES_TOTAL] + - cf_stats_count_[HARD_PENDING_COMPACTION_BYTES_LIMIT] + - cf_stats_count_[MEMTABLE_COMPACTION]) - : (stall_leveln_slowdown_count_soft_[level] + - stall_leveln_slowdown_count_hard_[level]); - stats_sum.Add(comp_stats_[level]); total_file_size += vstorage->NumLevelBytes(level); - total_stall_count += stalls; - total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level]; - total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level]; double w_amp = (comp_stats_[level].bytes_read_non_output_levels == 0) ? 
0.0 : static_cast(comp_stats_[level].bytes_written) / comp_stats_[level].bytes_read_non_output_levels; PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files, files_being_compacted[level], - vstorage->NumLevelBytes(level), compaction_score[level], - w_amp, stalls, comp_stats_[level]); + static_cast(vstorage->NumLevelBytes(level)), + compaction_score[level], + w_amp, comp_stats_[level]); value->append(buf); } } - uint64_t curr_ingest = cf_stats_value_[BYTES_FLUSHED]; + + uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED]; + uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE]; + uint64_t curr_ingest = flush_ingest + add_file_ingest; // Cumulative summary double w_amp = stats_sum.bytes_written / static_cast(curr_ingest + 1); + uint64_t total_stall_count = + cf_stats_count_[LEVEL0_SLOWDOWN_TOTAL] + + cf_stats_count_[LEVEL0_NUM_FILES_TOTAL] + + cf_stats_count_[SOFT_PENDING_COMPACTION_BYTES_LIMIT] + + cf_stats_count_[HARD_PENDING_COMPACTION_BYTES_LIMIT] + + cf_stats_count_[MEMTABLE_COMPACTION] + cf_stats_count_[MEMTABLE_SLOWDOWN]; // Stats summary across levels PrintLevelStats(buf, sizeof(buf), "Sum", total_files, - total_files_being_compacted, total_file_size, 0, w_amp, - total_stall_count, stats_sum); + total_files_being_compacted, total_file_size, 0, w_amp, + stats_sum); value->append(buf); // Interval summary + uint64_t interval_flush_ingest = + flush_ingest - cf_stats_snapshot_.ingest_bytes_flush; + uint64_t interval_add_file_inget = + add_file_ingest - cf_stats_snapshot_.ingest_bytes_add_file; uint64_t interval_ingest = - curr_ingest - cf_stats_snapshot_.ingest_bytes + 1; + interval_flush_ingest + interval_add_file_inget + 1; CompactionStats interval_stats(stats_sum); interval_stats.Subtract(cf_stats_snapshot_.comp_stats); w_amp = interval_stats.bytes_written / static_cast(interval_ingest); - PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, - w_amp, total_stall_count - cf_stats_snapshot_.stall_count, - interval_stats); + 
PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats); + value->append(buf); + + double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up; + snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", + seconds_up, interval_seconds_up); + value->append(buf); + + snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n", + flush_ingest / kGB, interval_flush_ingest / kGB); + snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n", + add_file_ingest / kGB, interval_add_file_inget / kGB); value->append(buf); + // Compact + uint64_t compact_bytes_read = 0; + uint64_t compact_bytes_write = 0; + uint64_t compact_micros = 0; + for (int level = 0; level < number_levels_; level++) { + compact_bytes_read += comp_stats_[level].bytes_read_output_level + + comp_stats_[level].bytes_read_non_output_levels; + compact_bytes_write += comp_stats_[level].bytes_written; + compact_micros += comp_stats_[level].micros; + } + snprintf(buf, sizeof(buf), - "Flush(GB): cumulative %.3f, interval %.3f\n", - curr_ingest / kGB, interval_ingest / kGB); + "Cumulative compaction: %.2f GB write, %.2f MB/s write, " + "%.2f GB read, %.2f MB/s read, %.1f seconds\n", + compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up, + compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up, + compact_micros / kMicrosInSec); + value->append(buf); + + // Compaction interval + uint64_t interval_compact_bytes_write = + compact_bytes_write - cf_stats_snapshot_.compact_bytes_write; + uint64_t interval_compact_bytes_read = + compact_bytes_read - cf_stats_snapshot_.compact_bytes_read; + uint64_t interval_compact_micros = + compact_micros - cf_stats_snapshot_.compact_micros; + + snprintf( + buf, sizeof(buf), + "Interval compaction: %.2f GB write, %.2f MB/s write, " + "%.2f GB read, %.2f MB/s read, %.1f seconds\n", + interval_compact_bytes_write / 
kGB, + interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001), + interval_compact_bytes_read / kGB, + interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001), + interval_compact_micros / kMicrosInSec); value->append(buf); + cf_stats_snapshot_.compact_bytes_write = compact_bytes_write; + cf_stats_snapshot_.compact_bytes_read = compact_bytes_read; + cf_stats_snapshot_.compact_micros = compact_micros; snprintf(buf, sizeof(buf), "Stalls(count): %" PRIu64 " level0_slowdown, " @@ -726,22 +861,27 @@ void InternalStats::DumpCFStats(std::string* value) { "%" PRIu64 " level0_numfiles_with_compaction, " "%" PRIu64 - " pending_compaction_bytes, " + " stop for pending_compaction_bytes, " + "%" PRIu64 + " slowdown for pending_compaction_bytes, " "%" PRIu64 " memtable_compaction, " "%" PRIu64 - " leveln_slowdown_soft, " - "%" PRIu64 " leveln_slowdown_hard\n", + " memtable_slowdown, " + "interval %" PRIu64 " total count\n", cf_stats_count_[LEVEL0_SLOWDOWN_TOTAL], cf_stats_count_[LEVEL0_SLOWDOWN_WITH_COMPACTION], cf_stats_count_[LEVEL0_NUM_FILES_TOTAL], cf_stats_count_[LEVEL0_NUM_FILES_WITH_COMPACTION], cf_stats_count_[HARD_PENDING_COMPACTION_BYTES_LIMIT], - cf_stats_count_[MEMTABLE_COMPACTION], total_slowdown_count_soft, - total_slowdown_count_hard); + cf_stats_count_[SOFT_PENDING_COMPACTION_BYTES_LIMIT], + cf_stats_count_[MEMTABLE_COMPACTION], + cf_stats_count_[MEMTABLE_SLOWDOWN], + total_stall_count - cf_stats_snapshot_.stall_count); value->append(buf); - cf_stats_snapshot_.ingest_bytes = curr_ingest; + cf_stats_snapshot_.ingest_bytes_flush = flush_ingest; + cf_stats_snapshot_.ingest_bytes_add_file = add_file_ingest; cf_stats_snapshot_.comp_stats = stats_sum; cf_stats_snapshot_.stall_count = total_stall_count; } @@ -749,10 +889,7 @@ void InternalStats::DumpCFStats(std::string* value) { #else -DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, - bool* need_out_of_mutex) { - return kUnknown; -} +const DBPropertyInfo* 
GetPropertyInfo(const Slice& property) { return nullptr; } #endif // !ROCKSDB_LITE diff --git a/external/rocksdb/db/internal_stats.h b/external/rocksdb/db/internal_stats.h index eeb226e5ee..56e36b692f 100644 --- a/external/rocksdb/db/internal_stats.h +++ b/external/rocksdb/db/internal_stats.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,60 +21,29 @@ namespace rocksdb { class MemTableList; class DBImpl; -// IMPORTANT: If you add a new property here, also add it to the list in -// include/rocksdb/db.h -enum DBPropertyType : uint32_t { - kUnknown, - kNumFilesAtLevel, // Number of files at a specific level - kLevelStats, // Return number of files and total sizes of each level - kCFStats, // Return general statitistics of CF - kDBStats, // Return general statitistics of DB - kStats, // Return general statitistics of both DB and CF - kSsTables, // Return a human readable string of current SST files - kStartIntTypes, // ---- Dummy value to indicate the start of integer values - kNumImmutableMemTable, // Return number of immutable mem tables that - // have not been flushed. - kNumImmutableMemTableFlushed, // Return number of immutable mem tables - // in memory that have already been flushed - kMemtableFlushPending, // Return 1 if mem table flushing is pending, - // otherwise 0. - kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. - kBackgroundErrors, // Return accumulated background errors encountered. 
- kCurSizeActiveMemTable, // Return current size of the active memtable - kCurSizeAllMemTables, // Return current size of unflushed - // (active + immutable) memtables - kSizeAllMemTables, // Return current size of all (active + immutable - // + pinned) memtables - kNumEntriesInMutableMemtable, // Return number of deletes in the mutable - // memtable. - kNumEntriesInImmutableMemtable, // Return sum of number of entries in all - // the immutable mem tables. - kNumDeletesInMutableMemtable, // Return number of entries in the mutable - // memtable. - kNumDeletesInImmutableMemtable, // Return sum of number of deletes in all - // the immutable mem tables. - kEstimatedNumKeys, // Estimated total number of keys in the database. - kEstimatedUsageByTableReaders, // Estimated memory by table readers. - kIsFileDeletionEnabled, // Equals disable_delete_obsolete_files_, - // 0 means file deletions enabled - kNumSnapshots, // Number of snapshots in the system - kOldestSnapshotTime, // Unix timestamp of the first snapshot - kNumLiveVersions, - kEstimateLiveDataSize, // Estimated amount of live data in bytes - kTotalSstFilesSize, // Total size of all sst files. - kBaseLevel, // The level that L0 data is compacted to - kEstimatePendingCompactionBytes, // Estimated bytes to compaction - kAggregatedTableProperties, // Return a string that contains the aggregated - // table properties. - kAggregatedTablePropertiesAtLevel, // Return a string that contains the - // aggregated - // table properties at the specified level. +// Config for retrieving a property's value. +struct DBPropertyInfo { + bool need_out_of_mutex; + + // gcc had an internal error for initializing union of pointer-to-member- + // functions. Workaround is to populate exactly one of the following function + // pointers with a non-nullptr value. + + // @param value Value-result argument for storing the property's string value + // @param suffix Argument portion of the property. 
For example, suffix would + // be "5" for the property "rocksdb.num-files-at-level5". So far, only + // certain string properties take an argument. + bool (InternalStats::*handle_string)(std::string* value, Slice suffix); + + // @param value Value-result argument for storing the property's uint64 value + // @param db Many of the int properties rely on DBImpl methods. + // @param version Version is needed in case the property is retrieved without + // holding db mutex, which is only supported for int properties. + bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db, + Version* version); }; -extern DBPropertyType GetPropertyType(const Slice& property, - bool* is_int_property, - bool* need_out_of_mutex); - +extern const DBPropertyInfo* GetPropertyInfo(const Slice& property); #ifndef ROCKSDB_LITE class InternalStats { @@ -83,11 +52,14 @@ class InternalStats { LEVEL0_SLOWDOWN_TOTAL, LEVEL0_SLOWDOWN_WITH_COMPACTION, MEMTABLE_COMPACTION, + MEMTABLE_SLOWDOWN, LEVEL0_NUM_FILES_TOTAL, LEVEL0_NUM_FILES_WITH_COMPACTION, + SOFT_PENDING_COMPACTION_BYTES_LIMIT, HARD_PENDING_COMPACTION_BYTES_LIMIT, WRITE_STALLS_ENUM_MAX, BYTES_FLUSHED, + BYTES_INGESTED_ADD_FILE, INTERNAL_CF_STATS_ENUM_MAX, }; @@ -104,30 +76,16 @@ class InternalStats { }; InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) - : db_stats_(INTERNAL_DB_STATS_ENUM_MAX), - cf_stats_value_(INTERNAL_CF_STATS_ENUM_MAX), - cf_stats_count_(INTERNAL_CF_STATS_ENUM_MAX), + : db_stats_{}, + cf_stats_value_{}, + cf_stats_count_{}, comp_stats_(num_levels), - stall_leveln_slowdown_count_hard_(num_levels), - stall_leveln_slowdown_count_soft_(num_levels), file_read_latency_(num_levels), bg_error_count_(0), number_levels_(num_levels), env_(env), cfd_(cfd), - started_at_(env->NowMicros()) { - for (int i = 0; i< INTERNAL_DB_STATS_ENUM_MAX; ++i) { - db_stats_[i] = 0; - } - for (int i = 0; i< INTERNAL_CF_STATS_ENUM_MAX; ++i) { - cf_stats_value_[i] = 0; - cf_stats_count_[i] = 0; - } - for (int i = 0; i < num_levels; 
++i) { - stall_leveln_slowdown_count_hard_[i] = 0; - stall_leveln_slowdown_count_soft_[i] = 0; - } - } + started_at_(env->NowMicros()) {} // Per level compaction stats. comp_stats_[level] stores the stats for // compactions that produced data for the specified "level". @@ -234,21 +192,19 @@ class InternalStats { comp_stats_[level].bytes_moved += amount; } - void RecordLevelNSlowdown(int level, bool soft) { - if (soft) { - ++stall_leveln_slowdown_count_soft_[level]; - } else { - ++stall_leveln_slowdown_count_hard_[level]; - } - } - void AddCFStats(InternalCFStatsType type, uint64_t value) { cf_stats_value_[type] += value; ++cf_stats_count_[type]; } void AddDBStats(InternalDBStatsType type, uint64_t value) { - db_stats_[type] += value; + auto& v = db_stats_[type]; + v.store(v.load(std::memory_order_relaxed) + value, + std::memory_order_relaxed); + } + + uint64_t GetDBStats(InternalDBStatsType type) { + return db_stats_[type].load(std::memory_order_relaxed); } HistogramImpl* GetFileReadHist(int level) { @@ -259,42 +215,54 @@ class InternalStats { uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } - bool GetStringProperty(DBPropertyType property_type, const Slice& property, - std::string* value); + bool GetStringProperty(const DBPropertyInfo& property_info, + const Slice& property, std::string* value); - bool GetIntProperty(DBPropertyType property_type, uint64_t* value, - DBImpl* db) const; + bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value, + DBImpl* db); - bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, - uint64_t* value) const; + bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info, + Version* version, uint64_t* value); + + // Store a mapping from the user-facing DB::Properties string to our + // DBPropertyInfo struct used internally for retrieving properties. 
+ static const std::unordered_map ppt_name_to_info; private: void DumpDBStats(std::string* value); void DumpCFStats(std::string* value); // Per-DB stats - std::vector db_stats_; + std::atomic db_stats_[INTERNAL_DB_STATS_ENUM_MAX]; // Per-ColumnFamily stats - std::vector cf_stats_value_; - std::vector cf_stats_count_; + uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX]; + uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX]; // Per-ColumnFamily/level compaction stats std::vector comp_stats_; - // These count the number of microseconds for which MakeRoomForWrite stalls. - std::vector stall_leveln_slowdown_count_hard_; - std::vector stall_leveln_slowdown_count_soft_; std::vector file_read_latency_; // Used to compute per-interval statistics struct CFStatsSnapshot { // ColumnFamily-level stats CompactionStats comp_stats; - uint64_t ingest_bytes; // Bytes written to L0 + uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush) + uint64_t ingest_bytes_add_file; // Bytes written to L0 (AddFile) uint64_t stall_count; // Stall count + // Stats from compaction jobs - bytes written, bytes read, duration. + uint64_t compact_bytes_write; + uint64_t compact_bytes_read; + uint64_t compact_micros; + double seconds_up; CFStatsSnapshot() : comp_stats(0), - ingest_bytes(0), - stall_count(0) {} + ingest_bytes_flush(0), + ingest_bytes_add_file(0), + stall_count(0), + compact_bytes_write(0), + compact_bytes_read(0), + compact_micros(0), + seconds_up(0) {} } cf_stats_snapshot_; struct DBStatsSnapshot { @@ -307,10 +275,6 @@ class InternalStats { // another thread. uint64_t write_other; uint64_t write_self; - // Stats from compaction jobs - bytes written, bytes read, duration. - uint64_t compact_bytes_write; - uint64_t compact_bytes_read; - uint64_t compact_micros; // Total number of keys written. write_self and write_other measure number // of write requests written, Each of the write request can contain updates // to multiple keys. 
num_keys_written is total number of keys updated by all @@ -327,14 +291,62 @@ class InternalStats { write_with_wal(0), write_other(0), write_self(0), - compact_bytes_write(0), - compact_bytes_read(0), - compact_micros(0), num_keys_written(0), write_stall_micros(0), seconds_up(0) {} } db_stats_snapshot_; + // Handler functions for getting property values. They use "value" as a value- + // result argument, and return true upon successfully setting "value". + bool HandleNumFilesAtLevel(std::string* value, Slice suffix); + bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix); + bool HandleLevelStats(std::string* value, Slice suffix); + bool HandleStats(std::string* value, Slice suffix); + bool HandleCFStats(std::string* value, Slice suffix); + bool HandleDBStats(std::string* value, Slice suffix); + bool HandleSsTables(std::string* value, Slice suffix); + bool HandleAggregatedTableProperties(std::string* value, Slice suffix); + bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); + bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db, + Version* version); + bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version); + bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db, + Version* version); + bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version); + bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version); + bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumEntriesImmMemTables(uint64_t* 
value, DBImpl* db, + Version* version); + bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db, + Version* version); + bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version); + bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version); + bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db, + Version* version); + bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, + Version* version); + bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version); + bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version); + bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db, + Version* version); + bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, + Version* version); + bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db, + Version* version); + // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. 
The failure can // be caused by any possible reason, including file system errors, out of @@ -356,11 +368,14 @@ class InternalStats { LEVEL0_SLOWDOWN_TOTAL, LEVEL0_SLOWDOWN_WITH_COMPACTION, MEMTABLE_COMPACTION, + MEMTABLE_SLOWDOWN, LEVEL0_NUM_FILES_TOTAL, LEVEL0_NUM_FILES_WITH_COMPACTION, + SOFT_PENDING_COMPACTION_BYTES_LIMIT, HARD_PENDING_COMPACTION_BYTES_LIMIT, WRITE_STALLS_ENUM_MAX, BYTES_FLUSHED, + BYTES_INGESTED_ADD_FILE, INTERNAL_CF_STATS_ENUM_MAX, }; @@ -404,8 +419,6 @@ class InternalStats { void IncBytesMoved(int level, uint64_t amount) {} - void RecordLevelNSlowdown(int level, bool soft) {} - void AddCFStats(InternalCFStatsType type, uint64_t value) {} void AddDBStats(InternalDBStatsType type, uint64_t value) {} @@ -416,14 +429,20 @@ class InternalStats { uint64_t BumpAndGetBackgroundErrorCount() { return 0; } - bool GetStringProperty(DBPropertyType property_type, const Slice& property, - std::string* value) { return false; } + bool GetStringProperty(const DBPropertyInfo& property_info, + const Slice& property, std::string* value) { + return false; + } - bool GetIntProperty(DBPropertyType property_type, uint64_t* value, - DBImpl* db) const { return false; } + bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value, + DBImpl* db) const { + return false; + } - bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, - uint64_t* value) const { return false; } + bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info, + Version* version, uint64_t* value) const { + return false; + } }; #endif // !ROCKSDB_LITE diff --git a/external/rocksdb/db/job_context.h b/external/rocksdb/db/job_context.h index 5a54e2d85f..286d522b73 100644 --- a/external/rocksdb/db/job_context.h +++ b/external/rocksdb/db/job_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,7 +12,6 @@ #include #include -#include "db/column_family.h" #include "db/log_writer.h" namespace rocksdb { @@ -22,9 +21,9 @@ class MemTable; struct JobContext { inline bool HaveSomethingToDelete() const { return full_scan_candidate_files.size() || sst_delete_files.size() || - log_delete_files.size() || new_superversion != nullptr || - superversions_to_free.size() > 0 || memtables_to_free.size() > 0 || - logs_to_free.size() > 0; + log_delete_files.size() || manifest_delete_files.size() || + new_superversion != nullptr || superversions_to_free.size() > 0 || + memtables_to_free.size() > 0 || logs_to_free.size() > 0; } // Structure to store information for candidate files to delete. @@ -56,6 +55,9 @@ struct JobContext { // a list of log files that we need to delete std::vector log_delete_files; + // a list of manifest files that we need to delete + std::vector manifest_delete_files; + // a list of memtables to be free autovector memtables_to_free; @@ -73,6 +75,9 @@ struct JobContext { uint64_t prev_log_number; uint64_t min_pending_output = 0; + uint64_t prev_total_log_size = 0; + size_t num_alive_log_files = 0; + uint64_t size_log_to_delete = 0; explicit JobContext(int _job_id, bool create_superversion = false) { job_id = _job_id; diff --git a/external/rocksdb/db/listener_test.cc b/external/rocksdb/db/listener_test.cc index ce683a5b3f..000fba6836 100644 --- a/external/rocksdb/db/listener_test.cc +++ b/external/rocksdb/db/listener_test.cc @@ -1,13 +1,15 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. #include "db/db_impl.h" +#include "db/db_test_util.h" #include "db/dbformat.h" #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -22,7 +24,6 @@ #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "util/hash.h" -#include "util/hash_linklist_rep.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" @@ -37,119 +38,45 @@ namespace rocksdb { -class EventListenerTest : public testing::Test { +class EventListenerTest : public DBTestBase { public: - EventListenerTest() { - dbname_ = test::TmpDir() + "/listener_test"; - EXPECT_OK(DestroyDB(dbname_, Options())); - db_ = nullptr; - Reopen(); - } - - ~EventListenerTest() { - Close(); - Options options; - options.db_paths.emplace_back(dbname_, 0); - options.db_paths.emplace_back(dbname_ + "_2", 0); - options.db_paths.emplace_back(dbname_ + "_3", 0); - options.db_paths.emplace_back(dbname_ + "_4", 0); - EXPECT_OK(DestroyDB(dbname_, options)); - } - - void CreateColumnFamilies(const std::vector& cfs, - const ColumnFamilyOptions* options = nullptr) { - ColumnFamilyOptions cf_opts; - cf_opts = ColumnFamilyOptions(Options()); - size_t cfi = handles_.size(); - handles_.resize(cfi + cfs.size()); - for (auto cf : cfs) { - ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); - } - } - - void Close() { - for (auto h : handles_) { - delete h; - } - handles_.clear(); - delete db_; - db_ = nullptr; - } - - void ReopenWithColumnFamilies(const std::vector& cfs, - const Options* options = nullptr) { - ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); - } - - Status TryReopenWithColumnFamilies(const std::vector& cfs, - const Options* options = nullptr) { - Close(); - Options opts = 
(options == nullptr) ? Options() : *options; - std::vector v_opts(cfs.size(), &opts); - return TryReopenWithColumnFamilies(cfs, v_opts); - } + EventListenerTest() : DBTestBase("/listener_test") {} - Status TryReopenWithColumnFamilies( - const std::vector& cfs, - const std::vector& options) { - Close(); - EXPECT_EQ(cfs.size(), options.size()); - std::vector column_families; - for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i])); - } - DBOptions db_opts = DBOptions(*options[0]); - return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); - } - - Status TryReopen(Options* options = nullptr) { - Close(); - Options opts; - if (options != nullptr) { - opts = *options; - } else { - opts.create_if_missing = true; - } + const size_t k110KB = 110 << 10; +}; - return DB::Open(opts, dbname_, &db_); +struct TestPropertiesCollector : public rocksdb::TablePropertiesCollector { + virtual rocksdb::Status AddUserKey(const rocksdb::Slice& key, + const rocksdb::Slice& value, + rocksdb::EntryType type, + rocksdb::SequenceNumber seq, + uint64_t file_size) override { + return Status::OK(); } - - void Reopen(Options* options = nullptr) { - ASSERT_OK(TryReopen(options)); + virtual rocksdb::Status Finish( + rocksdb::UserCollectedProperties* properties) override { + properties->insert({"0", "1"}); + return Status::OK(); } - void CreateAndReopenWithCF(const std::vector& cfs, - const Options* options = nullptr) { - CreateColumnFamilies(cfs, options); - std::vector cfs_plus_default = cfs; - cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); - ReopenWithColumnFamilies(cfs_plus_default, options); + virtual const char* Name() const override { + return "TestTablePropertiesCollector"; } - DBImpl* dbfull() { - return reinterpret_cast(db_); - } - - Status Put(int cf, const Slice& k, const Slice& v, - WriteOptions wo = WriteOptions()) { - return db_->Put(wo, handles_[cf], k, v); + 
rocksdb::UserCollectedProperties GetReadableProperties() const override { + rocksdb::UserCollectedProperties ret; + ret["2"] = "3"; + return ret; } +}; - Status Flush(size_t cf = 0) { - FlushOptions opt = FlushOptions(); - opt.wait = true; - if (cf == 0) { - return db_->Flush(opt); - } else { - return db_->Flush(opt, handles_[cf]); - } +class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + return new TestPropertiesCollector; } - - const size_t k110KB = 110 << 10; - - DB* db_; - std::string dbname_; - std::vector handles_; + const char* Name() const override { return "TestTablePropertiesCollector"; } }; class TestCompactionListener : public EventListener { @@ -161,6 +88,16 @@ class TestCompactionListener : public EventListener { ASSERT_GT(ci.output_files.size(), 0U); ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id); ASSERT_GT(ci.thread_id, 0U); + + for (auto fl : {ci.input_files, ci.output_files}) { + for (auto fn : fl) { + auto it = ci.table_properties.find(fn); + ASSERT_NE(it, ci.table_properties.end()); + auto tp = it->second; + ASSERT_TRUE(tp != nullptr); + ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1"); + } + } } std::vector compacted_dbs_; @@ -186,13 +123,15 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS options.level0_file_num_compaction_trigger = kNumL0Files; + options.table_properties_collector_factories.push_back( + std::make_shared()); TestCompactionListener* listener = new TestCompactionListener(); options.listeners.emplace_back(listener); std::vector cf_names = { "pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}; - CreateAndReopenWithCF(cf_names, &options); + CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); 
ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); @@ -200,12 +139,12 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); - for (size_t i = 1; i < 8; ++i) { + for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - const Slice kStart = "a"; - const Slice kEnd = "z"; + const Slice kRangeStart = "a"; + const Slice kRangeEnd = "z"; ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], - &kStart, &kEnd)); + &kRangeStart, &kRangeEnd)); dbfull()->TEST_WaitForFlushMemTable(); dbfull()->TEST_WaitForCompact(); } @@ -274,6 +213,8 @@ class TestFlushListener : public EventListener { ASSERT_EQ(prev_fc_info_.file_path, info.file_path); ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id); ASSERT_GT(info.thread_id, 0U); + ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second, + "1"); } std::vector flushed_column_family_names_; @@ -299,7 +240,9 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { std::vector cf_names = { "pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}; - CreateAndReopenWithCF(cf_names, &options); + options.table_properties_collector_factories.push_back( + std::make_shared()); + CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); @@ -308,7 +251,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); - for (size_t i = 1; i < 8; ++i) { + for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); dbfull()->TEST_WaitForFlushMemTable(); ASSERT_EQ(listener->flushed_dbs_.size(), i); @@ -330,10 +273,12 @@ TEST_F(EventListenerTest, 
MultiCF) { #endif // ROCKSDB_USING_THREAD_STATUS TestFlushListener* listener = new TestFlushListener(options.env); options.listeners.emplace_back(listener); + options.table_properties_collector_factories.push_back( + std::make_shared()); std::vector cf_names = { "pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}; - CreateAndReopenWithCF(cf_names, &options); + CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); @@ -342,7 +287,7 @@ TEST_F(EventListenerTest, MultiCF) { ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); - for (size_t i = 1; i < 8; ++i) { + for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); @@ -360,6 +305,8 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { #if ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS + options.table_properties_collector_factories.push_back( + std::make_shared()); std::vector listeners; const int kNumDBs = 5; const int kNumListeners = 10; @@ -454,8 +401,10 @@ TEST_F(EventListenerTest, DisableBGCompaction) { options.compaction_style = kCompactionStyleNone; options.compression = kNoCompression; options.write_buffer_size = 100000; // Small write buffer + options.table_properties_collector_factories.push_back( + std::make_shared()); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); @@ -469,7 +418,344 @@ TEST_F(EventListenerTest, DisableBGCompaction) { ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); } -} // namespace rocksdb +class TestCompactionReasonListener : public EventListener { + public: + void 
OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { + std::lock_guard lock(mutex_); + compaction_reasons_.push_back(ci.compaction_reason); + } + + std::vector compaction_reasons_; + std::mutex mutex_; +}; + +TEST_F(EventListenerTest, CompactionReasonLevel) { + Options options; + options.create_if_missing = true; + options.memtable_factory.reset( + new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + + TestCompactionReasonListener* listener = new TestCompactionReasonListener(); + options.listeners.emplace_back(listener); + + options.level0_file_num_compaction_trigger = 4; + options.compaction_style = kCompactionStyleLevel; + + DestroyAndReopen(options); + Random rnd(301); + + // Write 4 files in L0 + for (int i = 0; i < 4; i++) { + GenerateNewRandomFile(&rnd); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(listener->compaction_reasons_.size(), 1); + ASSERT_EQ(listener->compaction_reasons_[0], + CompactionReason::kLevelL0FilesNum); + + DestroyAndReopen(options); + + // Write 3 non-overlapping files in L0 + for (int k = 1; k <= 30; k++) { + ASSERT_OK(Put(Key(k), Key(k))); + if (k % 10 == 0) { + Flush(); + } + } + + // Do a trivial move from L0 -> L1 + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + options.max_bytes_for_level_base = 1; + Close(); + listener->compaction_reasons_.clear(); + Reopen(options); + + dbfull()->TEST_WaitForCompact(); + ASSERT_GT(listener->compaction_reasons_.size(), 1); + + for (auto compaction_reason : listener->compaction_reasons_) { + ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize); + } + + options.disable_auto_compactions = true; + Close(); + listener->compaction_reasons_.clear(); + Reopen(options); + + Put("key", "value"); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_GT(listener->compaction_reasons_.size(), 0); + for (auto compaction_reason : 
listener->compaction_reasons_) { + ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction); + } +} + +TEST_F(EventListenerTest, CompactionReasonUniversal) { + Options options; + options.create_if_missing = true; + options.memtable_factory.reset( + new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + + TestCompactionReasonListener* listener = new TestCompactionReasonListener(); + options.listeners.emplace_back(listener); + + options.compaction_style = kCompactionStyleUniversal; + + Random rnd(301); + + options.level0_file_num_compaction_trigger = 8; + options.compaction_options_universal.max_size_amplification_percent = 100000; + options.compaction_options_universal.size_ratio = 100000; + DestroyAndReopen(options); + listener->compaction_reasons_.clear(); + + // Write 8 files in L0 + for (int i = 0; i < 8; i++) { + GenerateNewRandomFile(&rnd); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(listener->compaction_reasons_.size(), 0); + for (auto compaction_reason : listener->compaction_reasons_) { + ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSortedRunNum); + } + + options.level0_file_num_compaction_trigger = 8; + options.compaction_options_universal.max_size_amplification_percent = 1; + options.compaction_options_universal.size_ratio = 100000; + + DestroyAndReopen(options); + listener->compaction_reasons_.clear(); + + // Write 8 files in L0 + for (int i = 0; i < 8; i++) { + GenerateNewRandomFile(&rnd); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(listener->compaction_reasons_.size(), 0); + for (auto compaction_reason : listener->compaction_reasons_) { + ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeAmplification); + } + + options.disable_auto_compactions = true; + Close(); + listener->compaction_reasons_.clear(); + Reopen(options); + + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + ASSERT_GT(listener->compaction_reasons_.size(), 0); + for (auto compaction_reason : 
listener->compaction_reasons_) { + ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction); + } +} + +TEST_F(EventListenerTest, CompactionReasonFIFO) { + Options options; + options.create_if_missing = true; + options.memtable_factory.reset( + new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + + TestCompactionReasonListener* listener = new TestCompactionReasonListener(); + options.listeners.emplace_back(listener); + + options.level0_file_num_compaction_trigger = 4; + options.compaction_style = kCompactionStyleFIFO; + options.compaction_options_fifo.max_table_files_size = 1; + + DestroyAndReopen(options); + Random rnd(301); + + // Write 4 files in L0 + for (int i = 0; i < 4; i++) { + GenerateNewRandomFile(&rnd); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(listener->compaction_reasons_.size(), 0); + for (auto compaction_reason : listener->compaction_reasons_) { + ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize); + } +} + +class TableFileCreationListener : public EventListener { + public: + class TestEnv : public EnvWrapper { + public: + TestEnv() : EnvWrapper(Env::Default()) {} + + void SetStatus(Status s) { status_ = s; } + + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") { + if (!status_.ok()) { + return status_; + } + } + return Env::Default()->NewWritableFile(fname, result, options); + } + + private: + Status status_; + }; + + TableFileCreationListener() { + for (int i = 0; i < 2; i++) { + started_[i] = finished_[i] = failure_[i] = 0; + } + } + + int Index(TableFileCreationReason reason) { + int idx; + switch (reason) { + case TableFileCreationReason::kFlush: + idx = 0; + break; + case TableFileCreationReason::kCompaction: + idx = 1; + break; + default: + idx = -1; + } + return idx; + } + + void CheckAndResetCounters(int flush_started, int flush_finished, + int flush_failure, int 
compaction_started, + int compaction_finished, int compaction_failure) { + ASSERT_EQ(started_[0], flush_started); + ASSERT_EQ(finished_[0], flush_finished); + ASSERT_EQ(failure_[0], flush_failure); + ASSERT_EQ(started_[1], compaction_started); + ASSERT_EQ(finished_[1], compaction_finished); + ASSERT_EQ(failure_[1], compaction_failure); + for (int i = 0; i < 2; i++) { + started_[i] = finished_[i] = failure_[i] = 0; + } + } + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info) override { + int idx = Index(info.reason); + if (idx >= 0) { + started_[idx]++; + } + ASSERT_GT(info.db_name.size(), 0U); + ASSERT_GT(info.cf_name.size(), 0U); + ASSERT_GT(info.file_path.size(), 0U); + ASSERT_GT(info.job_id, 0); + } + + void OnTableFileCreated(const TableFileCreationInfo& info) override { + int idx = Index(info.reason); + if (idx >= 0) { + finished_[idx]++; + } + ASSERT_GT(info.db_name.size(), 0U); + ASSERT_GT(info.cf_name.size(), 0U); + ASSERT_GT(info.file_path.size(), 0U); + ASSERT_GT(info.job_id, 0); + if (info.status.ok()) { + ASSERT_GT(info.table_properties.data_size, 0U); + ASSERT_GT(info.table_properties.raw_key_size, 0U); + ASSERT_GT(info.table_properties.raw_value_size, 0U); + ASSERT_GT(info.table_properties.num_data_blocks, 0U); + ASSERT_GT(info.table_properties.num_entries, 0U); + } else { + if (idx >= 0) { + failure_[idx]++; + } + } + } + + TestEnv test_env; + int started_[2]; + int finished_[2]; + int failure_[2]; +}; + +TEST_F(EventListenerTest, TableFileCreationListenersTest) { + auto listener = std::make_shared(); + Options options; + options.create_if_missing = true; + options.listeners.push_back(listener); + options.env = &listener->test_env; + DestroyAndReopen(options); + + ASSERT_OK(Put("foo", "aaa")); + ASSERT_OK(Put("bar", "bbb")); + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); + + ASSERT_OK(Put("foo", "aaa1")); + ASSERT_OK(Put("bar", "bbb1")); + 
listener->test_env.SetStatus(Status::NotSupported("not supported")); + ASSERT_NOK(Flush()); + listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); + listener->test_env.SetStatus(Status::OK()); + + Reopen(options); + ASSERT_OK(Put("foo", "aaa2")); + ASSERT_OK(Put("bar", "bbb2")); + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); + + const Slice kRangeStart = "a"; + const Slice kRangeEnd = "z"; + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); + dbfull()->TEST_WaitForCompact(); + listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0); + + ASSERT_OK(Put("foo", "aaa3")); + ASSERT_OK(Put("bar", "bbb3")); + ASSERT_OK(Flush()); + listener->test_env.SetStatus(Status::NotSupported("not supported")); + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); + dbfull()->TEST_WaitForCompact(); + listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1); +} + +class MemTableSealedListener : public EventListener { +private: + SequenceNumber latest_seq_number_; +public: + MemTableSealedListener() {} + void OnMemTableSealed(const MemTableInfo& info) override { + latest_seq_number_ = info.first_seqno; + } + + void OnFlushCompleted(DB* /*db*/, + const FlushJobInfo& flush_job_info) override { + ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_); + } +}; + +TEST_F(EventListenerTest, MemTableSealedListenerTest) { + auto listener = std::make_shared(); + Options options; + options.create_if_missing = true; + options.listeners.push_back(listener); + DestroyAndReopen(options); + + for (unsigned int i = 0; i < 10; i++) { + std::string tag = std::to_string(i); + ASSERT_OK(Put("foo"+tag, "aaa")); + ASSERT_OK(Put("bar"+tag, "bbb")); + + ASSERT_OK(Flush()); + } +} +} // namespace rocksdb + #endif // ROCKSDB_LITE @@ -477,4 +763,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - diff --git a/external/rocksdb/db/log_format.h 
b/external/rocksdb/db/log_format.h index 919c087e24..cf48a202f4 100644 --- a/external/rocksdb/db/log_format.h +++ b/external/rocksdb/db/log_format.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -22,14 +22,24 @@ enum RecordType { // For fragments kFirstType = 2, kMiddleType = 3, - kLastType = 4 + kLastType = 4, + + // For recycled log files + kRecyclableFullType = 5, + kRecyclableFirstType = 6, + kRecyclableMiddleType = 7, + kRecyclableLastType = 8, }; -static const int kMaxRecordType = kLastType; +static const int kMaxRecordType = kRecyclableLastType; static const unsigned int kBlockSize = 32768; // Header is checksum (4 bytes), type (1 byte), length (2 bytes). static const int kHeaderSize = 4 + 1 + 2; +// Recyclable header is checksum (4 bytes), type (1 byte), log number +// (4 bytes), length (2 bytes). +static const int kRecyclableHeaderSize = 4 + 1 + 4 + 2; + } // namespace log } // namespace rocksdb diff --git a/external/rocksdb/db/log_reader.cc b/external/rocksdb/db/log_reader.cc index 296f1d50c0..2da16a2863 100644 --- a/external/rocksdb/db/log_reader.cc +++ b/external/rocksdb/db/log_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -21,9 +21,11 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(unique_ptr&& _file, Reporter* reporter, - bool checksum, uint64_t initial_offset) - : file_(std::move(_file)), +Reader::Reader(std::shared_ptr info_log, + unique_ptr&& _file, Reporter* reporter, + bool checksum, uint64_t initial_offset, uint64_t log_num) + : info_log_(info_log), + file_(std::move(_file)), reporter_(reporter), checksum_(checksum), backing_store_(new char[kBlockSize]), @@ -33,7 +35,9 @@ Reader::Reader(unique_ptr&& _file, Reporter* reporter, eof_offset_(0), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset) {} + initial_offset_(initial_offset), + log_number_(log_num), + recycled_(false) {} Reader::~Reader() { delete[] backing_store_; @@ -62,8 +66,15 @@ bool Reader::SkipToInitialBlock() { return true; } +// For kAbsoluteConsistency, on clean shutdown we don't expect any error +// in the log files. For other modes, we can ignore only incomplete records +// in the last log file, which are presumably due to a write in progress +// during restart (or from log recycling). 
+// +// TODO krad: Evaluate if we need to move to a more strict mode where we +// restrict the inconsistency to only the last log bool Reader::ReadRecord(Slice* record, std::string* scratch, - const bool report_eof_inconsistency) { + WALRecoveryMode wal_recovery_mode) { if (last_record_offset_ < initial_offset_) { if (!SkipToInitialBlock()) { return false; @@ -80,10 +91,11 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, Slice fragment; while (true) { uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); - const unsigned int record_type = - ReadPhysicalRecord(&fragment, report_eof_inconsistency); + size_t drop_size = 0; + const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size); switch (record_type) { case kFullType: + case kRecyclableFullType: if (in_fragmented_record && !scratch->empty()) { // Handle bug in earlier versions of log::Writer where // it could emit an empty kFirstType record at the tail end @@ -98,6 +110,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, return true; case kFirstType: + case kRecyclableFirstType: if (in_fragmented_record && !scratch->empty()) { // Handle bug in earlier versions of log::Writer where // it could emit an empty kFirstType record at the tail end @@ -111,6 +124,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, break; case kMiddleType: + case kRecyclableMiddleType: if (!in_fragmented_record) { ReportCorruption(fragment.size(), "missing start of fragmented record(1)"); @@ -120,6 +134,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, break; case kLastType: + case kRecyclableLastType: if (!in_fragmented_record) { ReportCorruption(fragment.size(), "missing start of fragmented record(2)"); @@ -131,9 +146,17 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } break; + case kBadHeader: + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files + 
ReportCorruption(drop_size, "truncated header"); + } + // fall-thru + case kEof: if (in_fragmented_record) { - if (report_eof_inconsistency) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -143,6 +166,23 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } return false; + case kOldRecord: + if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) { + // Treat a record from a previous instance of the log as EOF. + if (in_fragmented_record) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files + ReportCorruption(scratch->size(), "error reading trailing data"); + } + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. 
+ scratch->clear(); + } + return false; + } + // fall-thru + case kBadRecord: if (in_fragmented_record) { ReportCorruption(scratch->size(), "error in middle of record"); @@ -151,6 +191,26 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } break; + case kBadRecordLen: + case kBadRecordChecksum: + if (recycled_ && + wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords) { + scratch->clear(); + return false; + } + if (record_type == kBadRecordLen) { + ReportCorruption(drop_size, "bad record length"); + } else { + ReportCorruption(drop_size, "checksum mismatch"); + } + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + default: { char buf[40]; snprintf(buf, sizeof(buf), "unknown record type %u", record_type); @@ -244,36 +304,49 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) { } } -unsigned int Reader::ReadPhysicalRecord(Slice* result, - const bool report_eof_inconsistency) { +bool Reader::ReadMore(size_t* drop_size, int *error) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + read_error_ = true; + *error = kEof; + return false; + } else if (buffer_.size() < (size_t)kBlockSize) { + eof_ = true; + eof_offset_ = buffer_.size(); + } + return true; + } else { + // Note that if buffer_ is non-empty, we have a truncated header at the + // end of the file, which can be caused by the writer crashing in the + // middle of writing the header. Unless explicitly requested we don't + // considering this an error, just report EOF. 
+ if (buffer_.size()) { + *drop_size = buffer_.size(); + buffer_.clear(); + *error = kBadHeader; + return false; + } + buffer_.clear(); + *error = kEof; + return false; + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { while (true) { + // We need at least the minimum header size if (buffer_.size() < (size_t)kHeaderSize) { - if (!eof_ && !read_error_) { - // Last read was a full read, so this is a trailer to skip - buffer_.clear(); - Status status = file_->Read(kBlockSize, &buffer_, backing_store_); - end_of_buffer_offset_ += buffer_.size(); - if (!status.ok()) { - buffer_.clear(); - ReportDrop(kBlockSize, status); - read_error_ = true; - return kEof; - } else if (buffer_.size() < (size_t)kBlockSize) { - eof_ = true; - eof_offset_ = buffer_.size(); - } - continue; - } else { - // Note that if buffer_ is non-empty, we have a truncated header at the - // end of the file, which can be caused by the writer crashing in the - // middle of writing the header. Unless explicitly requested we don't - // considering this an error, just report EOF. 
- if (buffer_.size() && report_eof_inconsistency) { - ReportCorruption(buffer_.size(), "truncated header"); - } - buffer_.clear(); - return kEof; + int r; + if (!ReadMore(drop_size, &r)) { + return r; } + continue; } // Parse the header @@ -282,18 +355,36 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, const uint32_t b = static_cast(header[5]) & 0xff; const unsigned int type = header[6]; const uint32_t length = a | (b << 8); - if (kHeaderSize + length > buffer_.size()) { - size_t drop_size = buffer_.size(); + int header_size = kHeaderSize; + if (type >= kRecyclableFullType && type <= kRecyclableLastType) { + if (end_of_buffer_offset_ - buffer_.size() == 0) { + recycled_ = true; + } + header_size = kRecyclableHeaderSize; + // We need enough for the larger header + if (buffer_.size() < (size_t)kRecyclableHeaderSize) { + int r; + if (!ReadMore(drop_size, &r)) { + return r; + } + continue; + } + const uint32_t log_num = DecodeFixed32(header + 7); + if (log_num != log_number_) { + return kOldRecord; + } + } + if (header_size + length > buffer_.size()) { + *drop_size = buffer_.size(); buffer_.clear(); if (!eof_) { - ReportCorruption(drop_size, "bad record length"); - return kBadRecord; + return kBadRecordLen; } // If the end of the file has been reached without reading |length| bytes // of payload, assume the writer died in the middle of writing the record. // Don't report a corruption unless requested. 
- if (drop_size && report_eof_inconsistency) { - ReportCorruption(drop_size, "truncated header"); + if (*drop_size) { + return kBadHeader; } return kEof; } @@ -311,29 +402,28 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, // Check crc if (checksum_) { uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); - uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6); if (actual_crc != expected_crc) { // Drop the rest of the buffer since "length" itself may have // been corrupted and if we trust it, we could find some // fragment of a real log record that just happens to look // like a valid log record. - size_t drop_size = buffer_.size(); + *drop_size = buffer_.size(); buffer_.clear(); - ReportCorruption(drop_size, "checksum mismatch"); - return kBadRecord; + return kBadRecordChecksum; } } - buffer_.remove_prefix(kHeaderSize + length); + buffer_.remove_prefix(header_size + length); // Skip physical record that started before initial_offset_ - if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < + if (end_of_buffer_offset_ - buffer_.size() - header_size - length < initial_offset_) { result->clear(); return kBadRecord; } - *result = Slice(header + kHeaderSize, length); + *result = Slice(header + header_size, length); return type; } } diff --git a/external/rocksdb/db/log_reader.h b/external/rocksdb/db/log_reader.h index 390696b854..4451185468 100644 --- a/external/rocksdb/db/log_reader.h +++ b/external/rocksdb/db/log_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -14,10 +14,12 @@ #include "db/log_format.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" +#include "rocksdb/options.h" namespace rocksdb { class SequentialFileReader; +class Logger; using std::unique_ptr; namespace log { @@ -51,8 +53,10 @@ class Reader { // // The Reader will start reading at the first record located at physical // position >= initial_offset within the file. - Reader(unique_ptr&& file, Reporter* reporter, - bool checksum, uint64_t initial_offset); + Reader(std::shared_ptr info_log, + unique_ptr&& file, + Reporter* reporter, bool checksum, uint64_t initial_offset, + uint64_t log_num); ~Reader(); @@ -62,7 +66,8 @@ class Reader { // will only be valid until the next mutating operation on this // reader or the next mutation to *scratch. bool ReadRecord(Slice* record, std::string* scratch, - bool report_eof_inconsistency = false); + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords); // Returns the physical offset of the last record returned by ReadRecord. // @@ -84,6 +89,7 @@ class Reader { SequentialFileReader* file() { return file_.get(); } private: + std::shared_ptr info_log_; const unique_ptr file_; Reporter* const reporter_; bool const checksum_; @@ -104,6 +110,12 @@ class Reader { // Offset at which to start looking for the first record to return uint64_t const initial_offset_; + // which log number this is + uint64_t const log_number_; + + // Whether this is a recycled log file + bool recycled_; + // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, @@ -112,7 +124,15 @@ class Reader { // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) // * The record is a 0-length record (No drop is reported) // * The record is below constructor's initial_offset (No drop is reported) - kBadRecord = kMaxRecordType + 2 + kBadRecord = kMaxRecordType + 2, + // Returned when we fail to read a valid header. 
+ kBadHeader = kMaxRecordType + 3, + // Returned when we read an old record from a previous user of the log. + kOldRecord = kMaxRecordType + 4, + // Returned when we get a bad record length + kBadRecordLen = kMaxRecordType + 5, + // Returned when we get a bad record checksum + kBadRecordChecksum = kMaxRecordType + 6, }; // Skips all blocks that are completely before "initial_offset_". @@ -121,8 +141,10 @@ class Reader { bool SkipToInitialBlock(); // Return type, or one of the preceding special values - unsigned int ReadPhysicalRecord(Slice* result, - bool report_eof_inconsistency = false); + unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); + + // Read some more + bool ReadMore(size_t* drop_size, int *error); // Reports dropped bytes to the reporter. // buffer_ must be updated to remove the dropped bytes prior to invocation. diff --git a/external/rocksdb/db/log_test.cc b/external/rocksdb/db/log_test.cc index 5ab41f2510..c92f09137c 100644 --- a/external/rocksdb/db/log_test.cc +++ b/external/rocksdb/db/log_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -43,7 +43,7 @@ static std::string RandomSkewedString(int i, Random* rnd) { return BigString(NumberString(i), rnd->Skewed(17)); } -class LogTest : public testing::Test { +class LogTest : public ::testing::TestWithParam { private: class StringSource : public SequentialFile { public: @@ -153,19 +153,28 @@ class LogTest : public testing::Test { // Record metadata for testing initial offset functionality static size_t initial_offset_record_sizes_[]; - static uint64_t initial_offset_last_record_offsets_[]; + uint64_t initial_offset_last_record_offsets_[4]; public: LogTest() : reader_contents_(), - dest_holder_( - test::GetWritableFileWriter( - new test::StringSink(&reader_contents_))), + dest_holder_(test::GetWritableFileWriter( + new test::StringSink(&reader_contents_))), source_holder_( test::GetSequentialFileReader(new StringSource(reader_contents_))), - writer_(std::move(dest_holder_)), - reader_(std::move(source_holder_), &report_, true /*checksum*/, - 0 /*initial_offset*/) {} + writer_(std::move(dest_holder_), 123, GetParam()), + reader_(NULL, std::move(source_holder_), &report_, true /*checksum*/, + 0 /*initial_offset*/, 123) { + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + initial_offset_last_record_offsets_[0] = 0; + initial_offset_last_record_offsets_[1] = header_size + 10000; + initial_offset_last_record_offsets_[2] = 2 * (header_size + 10000); + initial_offset_last_record_offsets_[3] = 2 * (header_size + 10000) + + (2 * log::kBlockSize - 1000) + + 3 * header_size; + } + + Slice* get_reader_contents() { return &reader_contents_; } void Write(const std::string& msg) { writer_.AddRecord(Slice(msg)); @@ -175,10 +184,11 @@ class LogTest : public testing::Test { return dest_contents().size(); } - std::string Read(const bool report_eof_inconsistency = false) { + std::string Read(const WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords) { std::string scratch; Slice record; - if (reader_.ReadRecord(&record, &scratch, report_eof_inconsistency)) { + if (reader_.ReadRecord(&record, &scratch, wal_recovery_mode)) { return record.ToString(); } else { return "EOF"; @@ -200,9 +210,11 @@ class LogTest : public testing::Test { dest->Drop(bytes); } - void FixChecksum(int header_offset, int len) { + void FixChecksum(int header_offset, int len, bool recyclable) { // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len); + int header_size = recyclable ? 
kRecyclableHeaderSize : kHeaderSize; + uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6], + header_size - 6 + len); crc = crc32c::Mask(crc); EncodeFixed32(&dest_contents()[header_offset], crc); } @@ -259,8 +271,8 @@ class LogTest : public testing::Test { unique_ptr file_reader( test::GetSequentialFileReader(new StringSource(reader_contents_))); unique_ptr offset_reader( - new Reader(std::move(file_reader), &report_, true /*checksum*/, - WrittenBytes() + offset_past_end)); + new Reader(NULL, std::move(file_reader), &report_, + true /*checksum*/, WrittenBytes() + offset_past_end, 123)); Slice record; std::string scratch; ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); @@ -271,8 +283,9 @@ class LogTest : public testing::Test { WriteInitialOffsetLog(); unique_ptr file_reader( test::GetSequentialFileReader(new StringSource(reader_contents_))); - unique_ptr offset_reader(new Reader( - std::move(file_reader), &report_, true /*checksum*/, initial_offset)); + unique_ptr offset_reader( + new Reader(NULL, std::move(file_reader), &report_, + true /*checksum*/, initial_offset, 123)); Slice record; std::string scratch; ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); @@ -291,16 +304,9 @@ size_t LogTest::initial_offset_record_sizes_[] = 2 * log::kBlockSize - 1000, // Span three blocks 1}; -uint64_t LogTest::initial_offset_last_record_offsets_[] = - {0, - kHeaderSize + 10000, - 2 * (kHeaderSize + 10000), - 2 * (kHeaderSize + 10000) + - (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; +TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } - -TEST_F(LogTest, ReadWrite) { +TEST_P(LogTest, ReadWrite) { Write("foo"); Write("bar"); Write(""); @@ -313,7 +319,7 @@ TEST_F(LogTest, ReadWrite) { ASSERT_EQ("EOF", Read()); // Make sure reads at eof work } -TEST_F(LogTest, ManyBlocks) { +TEST_P(LogTest, ManyBlocks) { for (int i = 0; i < 100000; i++) { Write(NumberString(i)); } @@ -323,7 +329,7 @@ 
TEST_F(LogTest, ManyBlocks) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, Fragmentation) { +TEST_P(LogTest, Fragmentation) { Write("small"); Write(BigString("medium", 50000)); Write(BigString("large", 100000)); @@ -333,11 +339,12 @@ TEST_F(LogTest, Fragmentation) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, MarginalTrailer) { +TEST_P(LogTest, MarginalTrailer) { // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); Write(""); Write("bar"); ASSERT_EQ(BigString("foo", n), Read()); @@ -346,11 +353,12 @@ TEST_F(LogTest, MarginalTrailer) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, MarginalTrailer2) { +TEST_P(LogTest, MarginalTrailer2) { // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); Write("bar"); ASSERT_EQ(BigString("foo", n), Read()); ASSERT_EQ("bar", Read()); @@ -359,10 +367,11 @@ TEST_F(LogTest, MarginalTrailer2) { ASSERT_EQ("", ReportMessage()); } -TEST_F(LogTest, ShortTrailer) { - const int n = kBlockSize - 2*kHeaderSize + 4; +TEST_P(LogTest, ShortTrailer) { + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); Write(""); Write("bar"); ASSERT_EQ(BigString("foo", n), Read()); @@ -371,15 +380,16 @@ TEST_F(LogTest, ShortTrailer) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, AlignedEof) { - const int n = kBlockSize - 2*kHeaderSize + 4; +TEST_P(LogTest, AlignedEof) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); ASSERT_EQ(BigString("foo", n), Read()); ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, RandomRead) { +TEST_P(LogTest, RandomRead) { const int N = 500; Random write_rnd(301); for (int i = 0; i < N; i++) { @@ -394,7 +404,7 @@ TEST_F(LogTest, RandomRead) { // Tests of all the error paths in log_reader.cc follow: -TEST_F(LogTest, ReadError) { +TEST_P(LogTest, ReadError) { Write("foo"); ForceError(); ASSERT_EQ("EOF", Read()); @@ -402,17 +412,17 @@ TEST_F(LogTest, ReadError) { ASSERT_EQ("OK", MatchError("read error")); } -TEST_F(LogTest, BadRecordType) { +TEST_P(LogTest, BadRecordType) { Write("foo"); // Type is stored in header[6] IncrementByte(6, 100); - FixChecksum(0, 3); + FixChecksum(0, 3, false); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("unknown record type")); } -TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) { +TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) { Write("foo"); ShrinkSize(4); // Drop all payload as well as a header byte ASSERT_EQ("EOF", Read()); @@ -421,27 +431,32 @@ TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) { ASSERT_EQ("", ReportMessage()); } -TEST_F(LogTest, 
TruncatedTrailingRecordIsNotIgnored) { +TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { Write("foo"); ShrinkSize(4); // Drop all payload as well as a header byte - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency*/ true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); // Truncated last record is ignored, not treated as an error ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError("Corruption: truncated header")); } -TEST_F(LogTest, BadLength) { - const int kPayloadSize = kBlockSize - kHeaderSize; +TEST_P(LogTest, BadLength) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + const int kPayloadSize = kBlockSize - header_size; Write(BigString("bar", kPayloadSize)); Write("foo"); // Least significant size byte is stored in header[4]. IncrementByte(4, 1); - ASSERT_EQ("foo", Read()); - ASSERT_EQ(kBlockSize, DroppedBytes()); - ASSERT_EQ("OK", MatchError("bad record length")); + if (!GetParam()) { + ASSERT_EQ("foo", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); + } else { + ASSERT_EQ("EOF", Read()); + } } -TEST_F(LogTest, BadLengthAtEndIsIgnored) { +TEST_P(LogTest, BadLengthAtEndIsIgnored) { Write("foo"); ShrinkSize(1); ASSERT_EQ("EOF", Read()); @@ -449,63 +464,68 @@ TEST_F(LogTest, BadLengthAtEndIsIgnored) { ASSERT_EQ("", ReportMessage()); } -TEST_F(LogTest, BadLengthAtEndIsNotIgnored) { +TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { Write("foo"); ShrinkSize(1); - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError("Corruption: truncated header")); } -TEST_F(LogTest, ChecksumMismatch) { - Write("foo"); - IncrementByte(0, 10); +TEST_P(LogTest, ChecksumMismatch) { + Write("foooooo"); + IncrementByte(0, 14); ASSERT_EQ("EOF", Read()); - ASSERT_EQ(10U, DroppedBytes()); - ASSERT_EQ("OK", MatchError("checksum mismatch")); + if (!GetParam()) { 
+ ASSERT_EQ(14U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); + } else { + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); + } } -TEST_F(LogTest, UnexpectedMiddleType) { +TEST_P(LogTest, UnexpectedMiddleType) { Write("foo"); - SetByte(6, kMiddleType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? kRecyclableMiddleType : kMiddleType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("missing start")); } -TEST_F(LogTest, UnexpectedLastType) { +TEST_P(LogTest, UnexpectedLastType) { Write("foo"); - SetByte(6, kLastType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? kRecyclableLastType : kLastType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("missing start")); } -TEST_F(LogTest, UnexpectedFullType) { +TEST_P(LogTest, UnexpectedFullType) { Write("foo"); Write("bar"); - SetByte(6, kFirstType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ("bar", Read()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("partial record without end")); } -TEST_F(LogTest, UnexpectedFirstType) { +TEST_P(LogTest, UnexpectedFirstType) { Write("foo"); Write(BigString("bar", 100000)); - SetByte(6, kFirstType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ(BigString("bar", 100000), Read()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("partial record without end")); } -TEST_F(LogTest, MissingLastIsIgnored) { +TEST_P(LogTest, MissingLastIsIgnored) { Write(BigString("bar", kBlockSize)); // Remove the LAST block, including header. 
ShrinkSize(14); @@ -514,16 +534,16 @@ TEST_F(LogTest, MissingLastIsIgnored) { ASSERT_EQ(0U, DroppedBytes()); } -TEST_F(LogTest, MissingLastIsNotIgnored) { +TEST_P(LogTest, MissingLastIsNotIgnored) { Write(BigString("bar", kBlockSize)); // Remove the LAST block, including header. ShrinkSize(14); - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data")); } -TEST_F(LogTest, PartialLastIsIgnored) { +TEST_P(LogTest, PartialLastIsIgnored) { Write(BigString("bar", kBlockSize)); // Cause a bad record length in the LAST block. ShrinkSize(1); @@ -532,18 +552,18 @@ TEST_F(LogTest, PartialLastIsIgnored) { ASSERT_EQ(0U, DroppedBytes()); } -TEST_F(LogTest, PartialLastIsNotIgnored) { +TEST_P(LogTest, PartialLastIsNotIgnored) { Write(BigString("bar", kBlockSize)); // Cause a bad record length in the LAST block. ShrinkSize(1); - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError( "Corruption: truncated headerCorruption: " "error reading trailing data")); } -TEST_F(LogTest, ErrorJoinsRecords) { +TEST_P(LogTest, ErrorJoinsRecords) { // Consider two fragmented records: // first(R1) last(R1) first(R2) last(R2) // where the middle two fragments disappear. 
We do not want @@ -559,53 +579,71 @@ TEST_F(LogTest, ErrorJoinsRecords) { SetByte(offset, 'x'); } - ASSERT_EQ("correct", Read()); - ASSERT_EQ("EOF", Read()); - size_t dropped = DroppedBytes(); - ASSERT_LE(dropped, 2 * kBlockSize + 100); - ASSERT_GE(dropped, 2 * kBlockSize); + if (!GetParam()) { + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + size_t dropped = DroppedBytes(); + ASSERT_LE(dropped, 2 * kBlockSize + 100); + ASSERT_GE(dropped, 2 * kBlockSize); + } else { + ASSERT_EQ("EOF", Read()); + } } -TEST_F(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); } +TEST_P(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); } -TEST_F(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); } +TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); } -TEST_F(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); } +TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); } -TEST_F(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); } +TEST_P(LogTest, ReadSecondStart) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(10000 + header_size, 1); +} -TEST_F(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); } +TEST_P(LogTest, ReadThirdOneOff) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(10000 + header_size + 1, 2); +} -TEST_F(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); } +TEST_P(LogTest, ReadThirdStart) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(20000 + 2 * header_size, 2); +} -TEST_F(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); } +TEST_P(LogTest, ReadFourthOneOff) { + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(20000 + 2 * header_size + 1, 3); +} -TEST_F(LogTest, ReadFourthFirstBlockTrailer) { +TEST_P(LogTest, ReadFourthFirstBlockTrailer) { CheckInitialOffsetRecord(log::kBlockSize - 4, 3); } -TEST_F(LogTest, ReadFourthMiddleBlock) { +TEST_P(LogTest, ReadFourthMiddleBlock) { CheckInitialOffsetRecord(log::kBlockSize + 1, 3); } -TEST_F(LogTest, ReadFourthLastBlock) { +TEST_P(LogTest, ReadFourthLastBlock) { CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); } -TEST_F(LogTest, ReadFourthStart) { +TEST_P(LogTest, ReadFourthStart) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; CheckInitialOffsetRecord( - 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 2 * (header_size + 1000) + (2 * log::kBlockSize - 1000) + 3 * header_size, 3); } -TEST_F(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } +TEST_P(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } -TEST_F(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } +TEST_P(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } -TEST_F(LogTest, ClearEofSingleBlock) { +TEST_P(LogTest, ClearEofSingleBlock) { Write("foo"); Write("bar"); - ForceEOF(3 + kHeaderSize + 2); + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + ForceEOF(3 + header_size + 2); ASSERT_EQ("foo", Read()); UnmarkEOF(); ASSERT_EQ("bar", Read()); @@ -617,12 +655,13 @@ TEST_F(LogTest, ClearEofSingleBlock) { ASSERT_TRUE(IsEOF()); } -TEST_F(LogTest, ClearEofMultiBlock) { +TEST_P(LogTest, ClearEofMultiBlock) { size_t num_full_blocks = 5; - size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25; + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + size_t n = (kBlockSize - header_size) * num_full_blocks + 25; Write(BigString("foo", n)); Write(BigString("bar", n)); - ForceEOF(n + num_full_blocks * kHeaderSize + 10); + ForceEOF(n + num_full_blocks * header_size + header_size + 3); ASSERT_EQ(BigString("foo", n), Read()); ASSERT_TRUE(IsEOF()); UnmarkEOF(); @@ -634,7 +673,7 @@ TEST_F(LogTest, ClearEofMultiBlock) { ASSERT_TRUE(IsEOF()); } -TEST_F(LogTest, ClearEofError) { +TEST_P(LogTest, ClearEofError) { // If an error occurs during Read() in UnmarkEOF(), the records contained // in the buffer should be returned on subsequent calls of ReadRecord() // until no more full records are left, whereafter ReadRecord() should return @@ -652,7 +691,7 @@ TEST_F(LogTest, ClearEofError) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, ClearEofError2) { +TEST_P(LogTest, ClearEofError2) { Write("foo"); Write("bar"); UnmarkEOF(); @@ -666,6 +705,31 @@ TEST_F(LogTest, ClearEofError2) { ASSERT_EQ("OK", MatchError("read error")); } +TEST_P(LogTest, Recycle) { + if (!GetParam()) { + return; // test is only valid for recycled logs + } + Write("foo"); + Write("bar"); + Write("baz"); + Write("bif"); + Write("blitz"); + while (get_reader_contents()->size() < log::kBlockSize * 2) { + Write("xxxxxxxxxxxxxxxx"); + } + unique_ptr dest_holder(test::GetWritableFileWriter( + new test::OverwritingStringSink(get_reader_contents()))); + Writer recycle_writer(std::move(dest_holder), 123, true); + recycle_writer.AddRecord(Slice("foooo")); + recycle_writer.AddRecord(Slice("bar")); + ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); + ASSERT_EQ("foooo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +INSTANTIATE_TEST_CASE_P(bool, LogTest, ::testing::Values(0, 2)); + } // namespace log } // namespace rocksdb diff --git a/external/rocksdb/db/log_writer.cc b/external/rocksdb/db/log_writer.cc index 32d4afdc96..3277088bef 100644 --- a/external/rocksdb/db/log_writer.cc +++ 
b/external/rocksdb/db/log_writer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,8 +18,12 @@ namespace rocksdb { namespace log { -Writer::Writer(unique_ptr&& dest) - : dest_(std::move(dest)), block_offset_(0) { +Writer::Writer(unique_ptr&& dest, + uint64_t log_number, bool recycle_log_files) + : dest_(std::move(dest)), + block_offset_(0), + log_number_(log_number), + recycle_log_files_(recycle_log_files) { for (int i = 0; i <= kMaxRecordType; i++) { char t = static_cast(i); type_crc_[i] = crc32c::Value(&t, 1); @@ -33,40 +37,46 @@ Status Writer::AddRecord(const Slice& slice) { const char* ptr = slice.data(); size_t left = slice.size(); + // Header size varies depending on whether we are recycling or not. + const int header_size = + recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize; + // Fragment the record if necessary and emit it. 
Note that if slice // is empty, we still want to iterate once to emit a single // zero-length record Status s; bool begin = true; do { - const int leftover = kBlockSize - block_offset_; + const int64_t leftover = kBlockSize - block_offset_; assert(leftover >= 0); - if (leftover < kHeaderSize) { + if (leftover < header_size) { // Switch to a new block if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize being 7) - assert(kHeaderSize == 7); - dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + // Fill the trailer (literal below relies on kHeaderSize and + // kRecyclableHeaderSize being <= 11) + assert(header_size <= 11); + dest_->Append( + Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", leftover)); } block_offset_ = 0; } - // Invariant: we never leave < kHeaderSize bytes in a block. - assert(static_cast(kBlockSize) - block_offset_ >= kHeaderSize); + // Invariant: we never leave < header_size bytes in a block. + assert(static_cast(kBlockSize - block_offset_) >= header_size); - const size_t avail = kBlockSize - block_offset_ - kHeaderSize; + const size_t avail = kBlockSize - block_offset_ - header_size; const size_t fragment_length = (left < avail) ? left : avail; RecordType type; const bool end = (left == fragment_length); if (begin && end) { - type = kFullType; + type = recycle_log_files_ ? kRecyclableFullType : kFullType; } else if (begin) { - type = kFirstType; + type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; } else if (end) { - type = kLastType; + type = recycle_log_files_ ? kRecyclableLastType : kLastType; } else { - type = kMiddleType; + type = recycle_log_files_ ? 
kRecyclableMiddleType : kMiddleType; } s = EmitPhysicalRecord(type, ptr, fragment_length); @@ -79,28 +89,48 @@ Status Writer::AddRecord(const Slice& slice) { Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes - assert(block_offset_ + kHeaderSize + n <= kBlockSize); + + size_t header_size; + char buf[kRecyclableHeaderSize]; // Format the header - char buf[kHeaderSize]; buf[4] = static_cast(n & 0xff); buf[5] = static_cast(n >> 8); buf[6] = static_cast(t); + uint32_t crc = type_crc_[t]; + if (t < kRecyclableFullType) { + // Legacy record format + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + header_size = kHeaderSize; + } else { + // Recyclable record format + assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize); + header_size = kRecyclableHeaderSize; + + // Only encode low 32-bits of the 64-bit log number. This means + // we will fail to detect an old record if we recycled a log from + // ~4 billion logs ago, but that is effectively impossible, and + // even if it were we'dbe far more likely to see a false positive + // on the 32-bit CRC. + EncodeFixed32(buf + 7, static_cast(log_number_)); + crc = crc32c::Extend(crc, buf + 7, 4); + } + // Compute the crc of the record type and the payload. 
- uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); - crc = crc32c::Mask(crc); // Adjust for storage + crc = crc32c::Extend(crc, ptr, n); + crc = crc32c::Mask(crc); // Adjust for storage EncodeFixed32(buf, crc); // Write the header and the payload - Status s = dest_->Append(Slice(buf, kHeaderSize)); + Status s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { s = dest_->Append(Slice(ptr, n)); if (s.ok()) { s = dest_->Flush(); } } - block_offset_ += kHeaderSize + n; + block_offset_ += header_size + n; return s; } diff --git a/external/rocksdb/db/log_writer.h b/external/rocksdb/db/log_writer.h index 6b59bbdd56..11267b33d9 100644 --- a/external/rocksdb/db/log_writer.h +++ b/external/rocksdb/db/log_writer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -43,7 +43,7 @@ namespace log { * Data is written out in kBlockSize chunks. If next record does not fit * into the space left, the leftover space will be padded with \0. * - * Record format: + * Legacy record format: * * +---------+-----------+-----------+--- ... ---+ * |CRC (4B) | Size (2B) | Type (1B) | Payload | @@ -57,13 +57,23 @@ namespace log { * blocks that are larger than kBlockSize * Payload = Byte stream as long as specified by the payload size * + * Recyclable record format: + * + * +---------+-----------+-----------+----------------+--- ... ---+ + * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload | + * +---------+-----------+-----------+----------------+--- ... ---+ + * + * Same as above, with the addition of + * Log number = 32bit log file number, so that we can distinguish between + * records written by the most recent log writer vs a previous one. 
*/ class Writer { public: // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. - explicit Writer(unique_ptr&& dest); + explicit Writer(unique_ptr&& dest, + uint64_t log_number, bool recycle_log_files); ~Writer(); Status AddRecord(const Slice& slice); @@ -71,9 +81,13 @@ class Writer { WritableFileWriter* file() { return dest_.get(); } const WritableFileWriter* file() const { return dest_.get(); } + uint64_t get_log_number() const { return log_number_; } + private: unique_ptr dest_; - int block_offset_; // Current offset in block + size_t block_offset_; // Current offset in block + uint64_t log_number_; + bool recycle_log_files_; // crc32c values for all supported record types. These are // pre-computed to reduce the overhead of computing the crc of the diff --git a/external/rocksdb/db/managed_iterator.cc b/external/rocksdb/db/managed_iterator.cc index 45faeba4ee..1d47f933df 100644 --- a/external/rocksdb/db/managed_iterator.cc +++ b/external/rocksdb/db/managed_iterator.cc @@ -1,10 +1,12 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
#ifndef ROCKSDB_LITE +#include "db/managed_iterator.h" + #include #include #include @@ -13,7 +15,7 @@ #include "db/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" -#include "db/managed_iterator.h" +#include "db/xfunc_test_points.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" @@ -77,7 +79,7 @@ ManagedIterator::ManagedIterator(DBImpl* db, const ReadOptions& read_options, release_supported_(true) { read_options_.managed = false; if ((!read_options_.tailing) && (read_options_.snapshot == nullptr)) { - assert(read_options_.snapshot = db_->GetSnapshot()); + assert(nullptr != (read_options_.snapshot = db_->GetSnapshot())); snapshot_created_ = true; } cfh_.SetCFD(cfd); @@ -208,7 +210,8 @@ void ManagedIterator::RebuildIterator() { void ManagedIterator::UpdateCurrent() { assert(mutable_iter_ != nullptr); - if (!(valid_ = mutable_iter_->Valid())) { + valid_ = mutable_iter_->Valid(); + if (!valid_) { status_ = mutable_iter_->status(); return; } diff --git a/external/rocksdb/db/managed_iterator.h b/external/rocksdb/db/managed_iterator.h index 00f56aea48..d9a87596ea 100644 --- a/external/rocksdb/db/managed_iterator.h +++ b/external/rocksdb/db/managed_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/util/manual_compaction_test.cc b/external/rocksdb/db/manual_compaction_test.cc similarity index 98% rename from external/rocksdb/util/manual_compaction_test.cc rename to external/rocksdb/db/manual_compaction_test.cc index 8613b7b365..0ff52d184d 100644 --- a/external/rocksdb/util/manual_compaction_test.cc +++ b/external/rocksdb/db/manual_compaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/memtable.cc b/external/rocksdb/db/memtable.cc index 54c119ee2f..d8e35a289c 100644 --- a/external/rocksdb/db/memtable.cc +++ b/external/rocksdb/db/memtable.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -15,12 +15,15 @@ #include "db/dbformat.h" #include "db/merge_context.h" -#include "db/writebuffer.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/internal_iterator.h" #include "table/merger.h" #include "util/arena.h" #include "util/coding.h" @@ -32,35 +35,36 @@ namespace rocksdb { -MemTableOptions::MemTableOptions( - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options) - : write_buffer_size(mutable_cf_options.write_buffer_size), - arena_block_size(mutable_cf_options.arena_block_size), - memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits), - memtable_prefix_bloom_probes( - mutable_cf_options.memtable_prefix_bloom_probes), - memtable_prefix_bloom_huge_page_tlb_size( - mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size), - inplace_update_support(ioptions.inplace_update_support), - inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), - inplace_callback(ioptions.inplace_callback), - max_successive_merges(mutable_cf_options.max_successive_merges), - filter_deletes(mutable_cf_options.filter_deletes), - statistics(ioptions.statistics), - merge_operator(ioptions.merge_operator), - info_log(ioptions.info_log) {} +MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) + : write_buffer_size(mutable_cf_options.write_buffer_size), + arena_block_size(mutable_cf_options.arena_block_size), + memtable_prefix_bloom_bits( + static_cast( + static_cast(mutable_cf_options.write_buffer_size) * + mutable_cf_options.memtable_prefix_bloom_size_ratio) * + 8u), + memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size), + inplace_update_support(ioptions.inplace_update_support), + 
inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), + inplace_callback(ioptions.inplace_callback), + max_successive_merges(mutable_cf_options.max_successive_merges), + statistics(ioptions.statistics), + merge_operator(ioptions.merge_operator), + info_log(ioptions.info_log) {} MemTable::MemTable(const InternalKeyComparator& cmp, const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, - WriteBuffer* write_buffer, SequenceNumber earliest_seq) + WriteBufferManager* write_buffer_manager, + SequenceNumber earliest_seq) : comparator_(cmp), moptions_(ioptions, mutable_cf_options), refs_(0), kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), - arena_(moptions_.arena_block_size), - allocator_(&arena_, write_buffer), + arena_(moptions_.arena_block_size, + mutable_cf_options.memtable_huge_page_size), + allocator_(&arena_, write_buffer_manager), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &allocator_, ioptions.prefix_extractor, ioptions.info_log)), @@ -73,23 +77,22 @@ MemTable::MemTable(const InternalKeyComparator& cmp, first_seqno_(0), earliest_seqno_(earliest_seq), mem_next_logfile_number_(0), + min_prep_log_referenced_(0), locks_(moptions_.inplace_update_support ? moptions_.inplace_update_num_locks : 0), prefix_extractor_(ioptions.prefix_extractor), - should_flush_(ShouldFlushNow()), - flush_scheduled_(false), + flush_state_(FLUSH_NOT_REQUESTED), env_(ioptions.env) { - // if should_flush_ == true without an entry inserted, something must have - // gone wrong already. 
- assert(!should_flush_); + UpdateFlushState(); + // something went wrong if we need to flush before inserting anything + assert(!ShouldScheduleFlush()); + if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom( - &allocator_, - moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality, - moptions_.memtable_prefix_bloom_probes, nullptr, - moptions_.memtable_prefix_bloom_huge_page_tlb_size, - ioptions.info_log)); + &allocator_, moptions_.memtable_prefix_bloom_bits, + ioptions.bloom_locality, 6 /* hard coded 6 probes */, nullptr, + moptions_.memtable_huge_page_size, ioptions.info_log)); } } @@ -166,6 +169,17 @@ bool MemTable::ShouldFlushNow() const { return arena_.AllocatedAndUnused() < kArenaBlockSize / 4; } +void MemTable::UpdateFlushState() { + auto state = flush_state_.load(std::memory_order_relaxed); + if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) { + // ignore CAS failure, because that means somebody else requested + // a flush + flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED, + std::memory_order_relaxed, + std::memory_order_relaxed); + } +} + int MemTable::KeyComparator::operator()(const char* prefix_len_key1, const char* prefix_len_key2) const { // Internal keys are encoded as length-prefixed strings. 
@@ -202,14 +216,15 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { return scratch->data(); } -class MemTableIterator: public Iterator { +class MemTableIterator : public InternalIterator { public: - MemTableIterator( - const MemTable& mem, const ReadOptions& read_options, Arena* arena) + MemTableIterator(const MemTable& mem, const ReadOptions& read_options, + Arena* arena) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), valid_(false), - arena_mode_(arena != nullptr) { + arena_mode_(arena != nullptr), + value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) { if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { bloom_ = mem.prefix_bloom_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); @@ -219,6 +234,12 @@ class MemTableIterator: public Iterator { } ~MemTableIterator() { +#ifndef NDEBUG + // Assert that the MemTableIterator is never deleted while + // Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); +#endif if (arena_mode_) { iter_->~Iterator(); } else { @@ -226,6 +247,14 @@ class MemTableIterator: public Iterator { } } +#ifndef NDEBUG + virtual void SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + virtual bool Valid() const override { return valid_; } virtual void Seek(const Slice& k) override { PERF_TIMER_GUARD(seek_on_memtable_time); @@ -273,19 +302,31 @@ class MemTableIterator: public Iterator { virtual Status status() const override { return Status::OK(); } + virtual bool IsKeyPinned() const override { + // memtable data is always pinned + return true; + } + + virtual bool IsValuePinned() const override { + // memtable value is always pinned, except if we allow inplace update. 
+ return value_pinned_; + } + private: DynamicBloom* bloom_; const SliceTransform* const prefix_extractor_; MemTableRep::Iterator* iter_; bool valid_; bool arena_mode_; + bool value_pinned_; // No copying allowed MemTableIterator(const MemTableIterator&); void operator=(const MemTableIterator&); }; -Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) { +InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, + Arena* arena) { assert(arena != nullptr); auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); return new (mem) MemTableIterator(*this, read_options, arena); @@ -318,7 +359,8 @@ uint64_t MemTable::ApproximateSize(const Slice& start_ikey, void MemTable::Add(SequenceNumber s, ValueType type, const Slice& key, /* user key */ - const Slice& value) { + const Slice& value, bool allow_concurrent, + MemTablePostProcessInfo* post_process_info) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -332,7 +374,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, val_size; char* buf = nullptr; KeyHandle handle = table_->Allocate(encoded_len, &buf); - assert(buf != nullptr); + char* p = EncodeVarint32(buf, internal_key_size); memcpy(p, key.data(), key_size); p += key_size; @@ -342,32 +384,65 @@ void MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); - table_->Insert(handle); - num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1, + if (!allow_concurrent) { + table_->Insert(handle); + + // this is a bit ugly, but is the way to avoid locked instructions + // when incrementing an atomic + num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len, std::memory_order_relaxed); - 
data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len, - std::memory_order_relaxed); - if (type == kTypeDeletion) { - num_deletes_++; - } + if (type == kTypeDeletion) { + num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + } - if (prefix_bloom_) { - assert(prefix_extractor_); - prefix_bloom_->Add(prefix_extractor_->Transform(key)); - } + if (prefix_bloom_) { + assert(prefix_extractor_); + prefix_bloom_->Add(prefix_extractor_->Transform(key)); + } - // The first sequence number inserted into the memtable - assert(first_seqno_ == 0 || s > first_seqno_); - if (first_seqno_ == 0) { - first_seqno_ = s; + // The first sequence number inserted into the memtable + assert(first_seqno_ == 0 || s > first_seqno_); + if (first_seqno_ == 0) { + first_seqno_.store(s, std::memory_order_relaxed); - if (earliest_seqno_ == kMaxSequenceNumber) { - earliest_seqno_ = first_seqno_; + if (earliest_seqno_ == kMaxSequenceNumber) { + earliest_seqno_.store(GetFirstSequenceNumber(), + std::memory_order_relaxed); + } + assert(first_seqno_.load() >= earliest_seqno_.load()); + } + assert(post_process_info == nullptr); + UpdateFlushState(); + } else { + table_->InsertConcurrently(handle); + + assert(post_process_info != nullptr); + post_process_info->num_entries++; + post_process_info->data_size += encoded_len; + if (type == kTypeDeletion) { + post_process_info->num_deletes++; + } + + if (prefix_bloom_) { + assert(prefix_extractor_); + prefix_bloom_->AddConcurrently(prefix_extractor_->Transform(key)); } - assert(first_seqno_ >= earliest_seqno_); - } - should_flush_ = ShouldFlushNow(); + // atomically update first_seqno_ and earliest_seqno_. 
+ uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed); + while ((cur_seq_num == 0 || s < cur_seq_num) && + !first_seqno_.compare_exchange_weak(cur_seq_num, s)) { + } + uint64_t cur_earliest_seqno = + earliest_seqno_.load(std::memory_order_relaxed); + while ( + (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) && + !first_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { + } + } } // Callback from MemTable::Get() @@ -424,23 +499,11 @@ static bool SaveValue(void* arg, const char* entry) { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->status) = Status::OK(); if (*(s->merge_in_progress)) { - assert(merge_operator); - bool merge_success = false; - { - StopWatchNano timer(s->env_, s->statistics != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - merge_success = merge_operator->FullMerge( - s->key->user_key(), &v, merge_context->GetOperands(), s->value, - s->logger); - RecordTick(s->statistics, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanos()); - } - if (!merge_success) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - *(s->status) = - Status::Corruption("Error: Could not perform merge."); - } - } else { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), s->value, s->logger, s->statistics, + s->env_); + } else if (s->value != nullptr) { s->value->assign(v.data(), v.size()); } if (s->inplace_update_support) { @@ -452,23 +515,10 @@ static bool SaveValue(void* arg, const char* entry) { case kTypeDeletion: case kTypeSingleDeletion: { if (*(s->merge_in_progress)) { - assert(merge_operator != nullptr); - *(s->status) = Status::OK(); - bool merge_success = false; - { - StopWatchNano timer(s->env_, s->statistics != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - merge_success = merge_operator->FullMerge( - s->key->user_key(), nullptr, merge_context->GetOperands(), - s->value, s->logger); - RecordTick(s->statistics, 
MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanos()); - } - if (!merge_success) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - *(s->status) = - Status::Corruption("Error: Could not perform merge."); - } + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), s->value, s->logger, s->statistics, + s->env_); } else { *(s->status) = Status::NotFound(); } @@ -488,7 +538,8 @@ static bool SaveValue(void* arg, const char* entry) { } Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->merge_in_progress) = true; - merge_context->PushOperand(v); + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); return true; } default: @@ -668,16 +719,16 @@ bool MemTable::UpdateCallback(SequenceNumber seq, } } RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); - should_flush_ = ShouldFlushNow(); + UpdateFlushState(); return true; } else if (status == UpdateStatus::UPDATED) { Add(seq, kTypeValue, key, Slice(str_value)); RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); - should_flush_ = ShouldFlushNow(); + UpdateFlushState(); return true; } else if (status == UpdateStatus::UPDATE_FAILED) { // No action required. Return. 
- should_flush_ = ShouldFlushNow(); + UpdateFlushState(); return true; } } @@ -735,4 +786,17 @@ void MemTableRep::Get(const LookupKey& k, void* callback_args, } } +void MemTable::RefLogContainingPrepSection(uint64_t log) { + assert(log > 0); + auto cur = min_prep_log_referenced_.load(); + while ((log < cur || cur == 0) && + !min_prep_log_referenced_.compare_exchange_strong(cur, log)) { + cur = min_prep_log_referenced_.load(); + } +} + +uint64_t MemTable::GetMinLogContainingPrepSection() { + return min_prep_log_referenced_.load(); +} + } // namespace rocksdb diff --git a/external/rocksdb/db/memtable.h b/external/rocksdb/db/memtable.h index f09082ce01..62931613e2 100644 --- a/external/rocksdb/db/memtable.h +++ b/external/rocksdb/db/memtable.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,10 +8,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#include -#include -#include +#include #include +#include +#include +#include #include #include "db/dbformat.h" #include "db/skiplist.h" @@ -21,8 +22,9 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/immutable_options.h" #include "db/memtable_allocator.h" -#include "util/arena.h" +#include "util/concurrent_arena.h" #include "util/dynamic_bloom.h" +#include "util/instrumented_mutex.h" #include "util/mutable_cf_options.h" namespace rocksdb { @@ -30,7 +32,7 @@ namespace rocksdb { class Mutex; class MemTableIterator; class MergeContext; -class WriteBuffer; +class InternalIterator; struct MemTableOptions { explicit MemTableOptions( @@ -39,8 +41,7 @@ struct MemTableOptions { size_t write_buffer_size; size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; - uint32_t memtable_prefix_bloom_probes; - size_t memtable_prefix_bloom_huge_page_tlb_size; + size_t memtable_huge_page_size; bool inplace_update_support; size_t inplace_update_num_locks; UpdateStatus (*inplace_callback)(char* existing_value, @@ -48,12 +49,20 @@ struct MemTableOptions { Slice delta_value, std::string* merged_value); size_t max_successive_merges; - bool filter_deletes; Statistics* statistics; MergeOperator* merge_operator; Logger* info_log; }; +// Batched counters to updated when inserting keys in one write batch. +// In post process of the write batch, these can be updated together. +// Only used in concurrent memtable insert case. +struct MemTablePostProcessInfo { + uint64_t data_size = 0; + uint64_t num_entries = 0; + uint64_t num_deletes = 0; +}; + // Note: Many of the methods in this class have comments indicating that // external synchromization is required as these methods are not thread-safe. 
// It is up to higher layers of code to decide how to prevent concurrent @@ -90,7 +99,8 @@ class MemTable { explicit MemTable(const InternalKeyComparator& comparator, const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, - WriteBuffer* write_buffer, SequenceNumber earliest_seq); + WriteBufferManager* write_buffer_manager, + SequenceNumber earliest_seq); // Do not delete this MemTable unless Unref() indicates it not in use. ~MemTable(); @@ -123,10 +133,17 @@ class MemTable { // This method heuristically determines if the memtable should continue to // host more data. bool ShouldScheduleFlush() const { - return flush_scheduled_ == false && should_flush_; + return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED; } - void MarkFlushScheduled() { flush_scheduled_ = true; } + // Returns true if a flush should be scheduled and the caller should + // be the one to schedule it + bool MarkFlushScheduled() { + auto before = FLUSH_REQUESTED; + return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED, + std::memory_order_relaxed, + std::memory_order_relaxed); + } // Return an iterator that yields the contents of the memtable. // @@ -140,17 +157,17 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. - Iterator* NewIterator(const ReadOptions& read_options, Arena* arena); + InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. // Typically value will be empty if type==kTypeDeletion. // - // REQUIRES: external synchronization to prevent simultaneous - // operations on the same MemTable. 
- void Add(SequenceNumber seq, ValueType type, - const Slice& key, - const Slice& value); + // REQUIRES: if allow_concurrent = false, external synchronization to prevent + // simultaneous operations on the same MemTable. + void Add(SequenceNumber seq, ValueType type, const Slice& key, + const Slice& value, bool allow_concurrent = false, + MemTablePostProcessInfo* post_process_info = nullptr); // If memtable contains a value for key, store it in *value and return true. // If memtable contains a deletion for key, store a NotFound() error @@ -188,7 +205,7 @@ class MemTable { const Slice& key, const Slice& value); - // If prev_value for key exits, attempts to update it inplace. + // If prev_value for key exists, attempts to update it inplace. // else returns false // Pseudocode // if key exists in current memtable && prev_value is of type kTypeValue @@ -209,6 +226,19 @@ class MemTable { // key in the memtable. size_t CountSuccessiveMergeEntries(const LookupKey& key); + // Update counters and flush status after inserting a whole write batch + // Used in concurrent memtable inserts. + void BatchPostProcess(const MemTablePostProcessInfo& update_counters) { + num_entries_.fetch_add(update_counters.num_entries, + std::memory_order_relaxed); + data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed); + if (update_counters.num_deletes != 0) { + num_deletes_.fetch_add(update_counters.num_deletes, + std::memory_order_relaxed); + } + UpdateFlushState(); + } + // Get total number of entries in the mem table. // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable (unless this Memtable is immutable). @@ -219,7 +249,9 @@ class MemTable { // Get total number of deletes in the mem table. // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable (unless this Memtable is immutable). 
- uint64_t num_deletes() const { return num_deletes_; } + uint64_t num_deletes() const { + return num_deletes_.load(std::memory_order_relaxed); + } // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } @@ -233,7 +265,9 @@ class MemTable { // into the memtable. // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable (unless this Memtable is immutable). - SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } + SequenceNumber GetFirstSequenceNumber() { + return first_seqno_.load(std::memory_order_relaxed); + } // Returns the sequence number that is guaranteed to be smaller than or equal // to the sequence number of any key that could be inserted into this @@ -242,7 +276,9 @@ class MemTable { // // If the earliest sequence number could not be determined, // kMaxSequenceNumber will be returned. - SequenceNumber GetEarliestSequenceNumber() { return earliest_seqno_; } + SequenceNumber GetEarliestSequenceNumber() { + return earliest_seqno_.load(std::memory_order_relaxed); + } // Returns the next active logfile number when this memtable is about to // be flushed to storage @@ -256,6 +292,13 @@ class MemTable { // operations on the same MemTable. void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } + // if this memtable contains data from a committed + // two phase transaction we must take note of the + // log which contains that data so we can know + // when to relese that log + void RefLogContainingPrepSection(uint64_t log); + uint64_t GetMinLogContainingPrepSection(); + // Notify the underlying storage that no more items will be added. // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. 
@@ -289,8 +332,7 @@ class MemTable { const MemTableOptions* GetMemTableOptions() const { return &moptions_; } private: - // Dynamically check if we can add more incoming entries - bool ShouldFlushNow() const; + enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; friend class MemTableIterator; friend class MemTableBackwardIterator; @@ -300,14 +342,14 @@ class MemTable { const MemTableOptions moptions_; int refs_; const size_t kArenaBlockSize; - Arena arena_; + ConcurrentArena arena_; MemTableAllocator allocator_; unique_ptr table_; // Total data size of all data inserted std::atomic data_size_; std::atomic num_entries_; - uint64_t num_deletes_; + std::atomic num_deletes_; // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush @@ -319,31 +361,38 @@ class MemTable { VersionEdit edit_; // The sequence number of the kv that was inserted first - SequenceNumber first_seqno_; + std::atomic first_seqno_; // The db sequence number at the time of creation or kMaxSequenceNumber // if not set. - SequenceNumber earliest_seqno_; + std::atomic earliest_seqno_; // The log files earlier than this number can be deleted. uint64_t mem_next_logfile_number_; + // the earliest log containing a prepared section + // which has been inserted into this memtable. 
+ std::atomic min_prep_log_referenced_; + // rw locks for inplace updates std::vector locks_; - // No copying allowed - MemTable(const MemTable&); - void operator=(const MemTable&); - const SliceTransform* const prefix_extractor_; std::unique_ptr prefix_bloom_; - // a flag indicating if a memtable has met the criteria to flush - bool should_flush_; + std::atomic flush_state_; - // a flag indicating if flush has been scheduled - bool flush_scheduled_; Env* env_; + + // Returns a heuristic flush decision + bool ShouldFlushNow() const; + + // Updates flush_state_ using ShouldFlushNow() + void UpdateFlushState(); + + // No copying allowed + MemTable(const MemTable&); + MemTable& operator=(const MemTable&); }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/external/rocksdb/db/memtable_allocator.cc b/external/rocksdb/db/memtable_allocator.cc index d3ecea2fde..9a7204dd88 100644 --- a/external/rocksdb/db/memtable_allocator.cc +++ b/external/rocksdb/db/memtable_allocator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,46 +7,53 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include - #include "db/memtable_allocator.h" -#include "db/writebuffer.h" + +#include +#include "rocksdb/write_buffer_manager.h" #include "util/arena.h" namespace rocksdb { -MemTableAllocator::MemTableAllocator(Arena* arena, WriteBuffer* write_buffer) - : arena_(arena), write_buffer_(write_buffer), bytes_allocated_(0) { -} +MemTableAllocator::MemTableAllocator(Allocator* allocator, + WriteBufferManager* write_buffer_manager) + : allocator_(allocator), + write_buffer_manager_(write_buffer_manager), + bytes_allocated_(0) {} -MemTableAllocator::~MemTableAllocator() { - DoneAllocating(); -} +MemTableAllocator::~MemTableAllocator() { DoneAllocating(); } char* MemTableAllocator::Allocate(size_t bytes) { - assert(write_buffer_ != nullptr); - bytes_allocated_ += bytes; - write_buffer_->ReserveMem(bytes); - return arena_->Allocate(bytes); + assert(write_buffer_manager_ != nullptr); + if (write_buffer_manager_->enabled()) { + bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); + write_buffer_manager_->ReserveMem(bytes); + } + return allocator_->Allocate(bytes); } char* MemTableAllocator::AllocateAligned(size_t bytes, size_t huge_page_size, Logger* logger) { - assert(write_buffer_ != nullptr); - bytes_allocated_ += bytes; - write_buffer_->ReserveMem(bytes); - return arena_->AllocateAligned(bytes, huge_page_size, logger); + assert(write_buffer_manager_ != nullptr); + if (write_buffer_manager_->enabled()) { + bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); + write_buffer_manager_->ReserveMem(bytes); + } + return allocator_->AllocateAligned(bytes, huge_page_size, logger); } void MemTableAllocator::DoneAllocating() { - if (write_buffer_ != nullptr) { - write_buffer_->FreeMem(bytes_allocated_); - write_buffer_ = nullptr; + if (write_buffer_manager_ != nullptr) { + if (write_buffer_manager_->enabled()) { + write_buffer_manager_->FreeMem( + bytes_allocated_.load(std::memory_order_relaxed)); + } else { + 
assert(bytes_allocated_.load(std::memory_order_relaxed) == 0); + } + write_buffer_manager_ = nullptr; } } -size_t MemTableAllocator::BlockSize() const { - return arena_->BlockSize(); -} +size_t MemTableAllocator::BlockSize() const { return allocator_->BlockSize(); } } // namespace rocksdb diff --git a/external/rocksdb/db/memtable_allocator.h b/external/rocksdb/db/memtable_allocator.h index fa8ee12871..050e13b365 100644 --- a/external/rocksdb/db/memtable_allocator.h +++ b/external/rocksdb/db/memtable_allocator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,20 +8,23 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. // // This is used by the MemTable to allocate write buffer memory. It connects -// to WriteBuffer so we can track and enforce overall write buffer limits. +// to WriteBufferManager so we can track and enforce overall write buffer +// limits. 
#pragma once + +#include +#include "rocksdb/write_buffer_manager.h" #include "util/allocator.h" namespace rocksdb { -class Arena; class Logger; -class WriteBuffer; class MemTableAllocator : public Allocator { public: - explicit MemTableAllocator(Arena* arena, WriteBuffer* write_buffer); + explicit MemTableAllocator(Allocator* allocator, + WriteBufferManager* write_buffer_manager); ~MemTableAllocator(); // Allocator interface @@ -35,9 +38,9 @@ class MemTableAllocator : public Allocator { void DoneAllocating(); private: - Arena* arena_; - WriteBuffer* write_buffer_; - size_t bytes_allocated_; + Allocator* allocator_; + WriteBufferManager* write_buffer_manager_; + std::atomic bytes_allocated_; // No copying allowed MemTableAllocator(const MemTableAllocator&); diff --git a/external/rocksdb/db/memtable_list.cc b/external/rocksdb/db/memtable_list.cc index b2bbbd1659..799887ed1d 100644 --- a/external/rocksdb/db/memtable_list.cc +++ b/external/rocksdb/db/memtable_list.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -11,14 +11,15 @@ #include #include -#include "rocksdb/db.h" #include "db/memtable.h" #include "db/version_set.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "table/merger.h" #include "util/coding.h" #include "util/log_buffer.h" +#include "util/sync_point.h" #include "util/thread_status_util.h" namespace rocksdb { @@ -138,9 +139,9 @@ bool MemTableListVersion::GetFromList(std::list* list, return false; } -void MemTableListVersion::AddIterators(const ReadOptions& options, - std::vector* iterator_list, - Arena* arena) { +void MemTableListVersion::AddIterators( + const ReadOptions& options, std::vector* iterator_list, + Arena* arena) { for (auto& m : memlist_) { iterator_list->push_back(m->NewIterator(options, arena)); } @@ -297,56 +298,81 @@ Status MemTableList::InstallMemtableFlushResults( // if some other thread is already committing, then return Status s; if (commit_in_progress_) { + TEST_SYNC_POINT("MemTableList::InstallMemtableFlushResults:InProgress"); return s; } // Only a single thread can be executing this piece of code commit_in_progress_ = true; - // scan all memtables from the earliest, and commit those - // (in that order) that have finished flushing. Memetables - // are always committed in the order that they were created. - while (!current_->memlist_.empty() && s.ok()) { - MemTable* m = current_->memlist_.back(); // get the last element - if (!m->flush_completed_) { + // Retry until all completed flushes are committed. New flushes can finish + // while the current thread is writing manifest where mutex is released. + while (s.ok()) { + auto& memlist = current_->memlist_; + if (memlist.empty() || !memlist.back()->flush_completed_) { break; } + // scan all memtables from the earliest, and commit those + // (in that order) that have finished flushing. Memetables + // are always committed in the order that they were created. 
+ uint64_t batch_file_number = 0; + size_t batch_count = 0; + autovector edit_list; + // enumerate from the last (earliest) element to see how many batch finished + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + if (!m->flush_completed_) { + break; + } + if (it == memlist.rbegin() || batch_file_number != m->file_number_) { + batch_file_number = m->file_number_; + LogToBuffer(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 " started", + cfd->GetName().c_str(), m->file_number_); + edit_list.push_back(&m->edit_); + } + batch_count++; + } - LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started", - cfd->GetName().c_str(), m->file_number_); - - // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory); - - // we will be changing the version in the next code path, - // so we better create a new one, since versions are immutable - InstallNewVersion(); - - // All the later memtables that have the same filenum - // are part of the same batch. They can be committed now. - uint64_t mem_id = 1; // how many memtables have been flushed. - do { - if (s.ok()) { // commit new state - LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - cfd->GetName().c_str(), m->file_number_, mem_id); - assert(m->file_number_ > 0); - current_->Remove(m, to_delete); + if (batch_count > 0) { + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, + db_directory); + + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. 
+ if (s.ok()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + ++mem_id; + } } else { - // commit failed. setup state so that we can flush again. - LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - m->file_number_, mem_id); - m->flush_completed_ = false; - m->flush_in_progress_ = false; - m->edit_.Clear(); - num_flush_not_started_++; - m->file_number_ = 0; - imm_flush_needed.store(true, std::memory_order_release); + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; it++) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. + LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + m->file_number_, mem_id); + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } } - ++mem_id; - } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) && - m->file_number_ == file_number); + } } commit_in_progress_ = false; return s; @@ -392,4 +418,24 @@ void MemTableList::InstallNewVersion() { } } +uint64_t MemTableList::GetMinLogContainingPrepSection() { + uint64_t min_log = 0; + + for (auto& m : current_->memlist_) { + // this mem has been flushed it no longer + // needs to hold on the its prep section + if (m->flush_completed_) { + continue; + } + + auto log = m->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + } // namespace rocksdb diff --git a/external/rocksdb/db/memtable_list.h b/external/rocksdb/db/memtable_list.h index 
63e27732b2..7f633f026b 100644 --- a/external/rocksdb/db/memtable_list.h +++ b/external/rocksdb/db/memtable_list.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -74,7 +74,8 @@ class MemTableListVersion { } void AddIterators(const ReadOptions& options, - std::vector* iterator_list, Arena* arena); + std::vector* iterator_list, + Arena* arena); void AddIterators(const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder); @@ -214,6 +215,8 @@ class MemTableList { size_t* current_memory_usage() { return ¤t_memory_usage_; } + uint64_t GetMinLogContainingPrepSection(); + private: // DB mutex held void InstallNewVersion(); diff --git a/external/rocksdb/db/memtable_list_test.cc b/external/rocksdb/db/memtable_list_test.cc index 7bb8b3b21a..42420b876c 100644 --- a/external/rocksdb/db/memtable_list_test.cc +++ b/external/rocksdb/db/memtable_list_test.cc @@ -1,21 +1,21 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#include "db/memtable_list.h" #include #include #include -#include "db/memtable_list.h" #include "db/merge_context.h" #include "db/version_set.h" #include "db/write_controller.h" -#include "db/writebuffer.h" #include "rocksdb/db.h" #include "rocksdb/status.h" -#include "util/testutil.h" +#include "rocksdb/write_buffer_manager.h" #include "util/string_util.h" #include "util/testharness.h" +#include "util/testutil.h" namespace rocksdb { @@ -59,12 +59,12 @@ class MemTableListTest : public testing::Test { DBOptions db_options; EnvOptions env_options; shared_ptr table_cache(NewLRUCache(50000, 16)); - WriteBuffer write_buffer(db_options.db_write_buffer_size); + WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); WriteController write_controller(10000000u); CreateDB(); VersionSet versions(dbname, &db_options, env_options, table_cache.get(), - &write_buffer, &write_controller); + &write_buffer_manager, &write_controller); // Create mock default ColumnFamilyData ColumnFamilyOptions cf_options; @@ -126,7 +126,7 @@ TEST_F(MemTableListTest, GetTest) { options.memtable_factory = factory; ImmutableCFOptions ioptions(options); - WriteBuffer wb(options.db_write_buffer_size); + WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb, kMaxSequenceNumber); @@ -163,7 +163,7 @@ TEST_F(MemTableListTest, GetTest) { SequenceNumber saved_seq = seq; // Create another memtable and write some keys to it - WriteBuffer wb2(options.db_write_buffer_size); + WriteBufferManager wb2(options.db_write_buffer_size); MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb2, kMaxSequenceNumber); @@ -228,7 +228,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { options.memtable_factory = factory; ImmutableCFOptions ioptions(options); - WriteBuffer wb(options.db_write_buffer_size); + WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, 
ioptions, MutableCFOptions(options, ioptions), &wb, kMaxSequenceNumber); @@ -303,7 +303,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_EQ("value2.2", value); // Create another memtable and write some keys to it - WriteBuffer wb2(options.db_write_buffer_size); + WriteBufferManager wb2(options.db_write_buffer_size); MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb2, kMaxSequenceNumber); @@ -329,7 +329,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_EQ(0, to_delete.size()); // Add a third memtable to push the first memtable out of the history - WriteBuffer wb3(options.db_write_buffer_size); + WriteBufferManager wb3(options.db_write_buffer_size); MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb3, kMaxSequenceNumber); @@ -390,7 +390,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { options.memtable_factory = factory; ImmutableCFOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); - WriteBuffer wb(options.db_write_buffer_size); + WriteBufferManager wb(options.db_write_buffer_size); autovector to_delete; // Create MemTableList diff --git a/external/rocksdb/db/memtablerep_bench.cc b/external/rocksdb/db/memtablerep_bench.cc index a2a8722269..b5875618b9 100644 --- a/external/rocksdb/db/memtablerep_bench.cc +++ b/external/rocksdb/db/memtablerep_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -28,13 +28,13 @@ int main() { #include "db/dbformat.h" #include "db/memtable.h" -#include "db/writebuffer.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" #include "rocksdb/memtablerep.h" #include "rocksdb/options.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/write_buffer_manager.h" #include "util/arena.h" #include "util/mutexlock.h" #include "util/stop_watch.h" @@ -132,8 +132,6 @@ DEFINE_int64(seed, 0, "Seed base for random number generators. " "When 0 it is deterministic."); -static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); - namespace rocksdb { namespace { @@ -592,6 +590,7 @@ int main(int argc, char** argv) { std::unique_ptr factory; if (FLAGS_memtablerep == "skiplist") { factory.reset(new rocksdb::SkipListFactory); +#ifndef ROCKSDB_LITE } else if (FLAGS_memtablerep == "vector") { factory.reset(new rocksdb::VectorRepFactory); } else if (FLAGS_memtablerep == "hashskiplist") { @@ -613,6 +612,7 @@ int main(int argc, char** argv) { static_cast(FLAGS_hash_function_count))); options.prefix_extractor.reset( rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); +#endif // ROCKSDB_LITE } else { fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str()); exit(1); @@ -622,7 +622,7 @@ int main(int argc, char** argv) { rocksdb::BytewiseComparator()); rocksdb::MemTable::KeyComparator key_comp(internal_key_comp); rocksdb::Arena arena; - rocksdb::WriteBuffer wb(FLAGS_write_buffer_size); + rocksdb::WriteBufferManager wb(FLAGS_write_buffer_size); rocksdb::MemTableAllocator memtable_allocator(&arena, &wb); uint64_t sequence; auto createMemtableRep = [&] { diff --git a/external/rocksdb/db/merge_context.h b/external/rocksdb/db/merge_context.h index f8609da75e..bf8bff0b6e 100644 --- a/external/rocksdb/db/merge_context.h +++ b/external/rocksdb/db/merge_context.h @@ -1,68 +1,116 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // #pragma once +#include +#include #include "db/dbformat.h" #include "rocksdb/slice.h" -#include -#include namespace rocksdb { -const std::deque empty_operand_list; +const std::vector empty_operand_list; // The merge context for merging a user key. // When doing a Get(), DB will create such a class and pass it when // issuing Get() operation to memtables and version_set. The operands // will be fetched from the context when issuing partial of full merge. class MergeContext { -public: + public: // Clear all the operands void Clear() { - if (operand_list) { - operand_list->clear(); + if (operand_list_) { + operand_list_->clear(); + copied_operands_->clear(); } } - // Replace all operands with merge_result, which are expected to be the - // merge result of them. - void PushPartialMergeResult(std::string& merge_result) { - assert (operand_list); - operand_list->clear(); - operand_list->push_front(std::move(merge_result)); - } + // Push a merge operand - void PushOperand(const Slice& operand_slice) { + void PushOperand(const Slice& operand_slice, bool operand_pinned = false) { + Initialize(); + SetDirectionBackward(); + + if (operand_pinned) { + operand_list_->push_back(operand_slice); + } else { + // We need to have our own copy of the operand since it's not pinned + copied_operands_->emplace_back( + new std::string(operand_slice.data(), operand_slice.size())); + operand_list_->push_back(*copied_operands_->back()); + } + } + + // Push back a merge operand + void PushOperandBack(const Slice& operand_slice, + bool operand_pinned = false) { Initialize(); - operand_list->push_front(operand_slice.ToString()); + SetDirectionForward(); + + if (operand_pinned) { + operand_list_->push_back(operand_slice); + } else { + // We need to have our own copy of 
the operand since it's not pinned + copied_operands_->emplace_back( + new std::string(operand_slice.data(), operand_slice.size())); + operand_list_->push_back(*copied_operands_->back()); + } } + // return total number of operands in the list size_t GetNumOperands() const { - if (!operand_list) { + if (!operand_list_) { return 0; } - return operand_list->size(); + return operand_list_->size(); } + // Get the operand at the index. - Slice GetOperand(int index) const { - assert (operand_list); - return (*operand_list)[index]; + Slice GetOperand(int index) { + assert(operand_list_); + + SetDirectionForward(); + return (*operand_list_)[index]; } + // Return all the operands. - const std::deque& GetOperands() const { - if (!operand_list) { + const std::vector& GetOperands() { + if (!operand_list_) { return empty_operand_list; } - return *operand_list; + + SetDirectionForward(); + return *operand_list_; } -private: + + private: void Initialize() { - if (!operand_list) { - operand_list.reset(new std::deque()); + if (!operand_list_) { + operand_list_.reset(new std::vector()); + copied_operands_.reset(new std::vector>()); + } + } + + void SetDirectionForward() { + if (operands_reversed_ == true) { + std::reverse(operand_list_->begin(), operand_list_->end()); + operands_reversed_ = false; + } + } + + void SetDirectionBackward() { + if (operands_reversed_ == false) { + std::reverse(operand_list_->begin(), operand_list_->end()); + operands_reversed_ = true; } } - std::unique_ptr> operand_list; + + // List of operands + std::unique_ptr> operand_list_; + // Copy of operands that are not pinned. + std::unique_ptr>> copied_operands_; + bool operands_reversed_ = true; }; } // namespace rocksdb diff --git a/external/rocksdb/db/merge_helper.cc b/external/rocksdb/db/merge_helper.cc index f9cb67e9cf..a3d3823fc9 100644 --- a/external/rocksdb/db/merge_helper.cc +++ b/external/rocksdb/db/merge_helper.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,35 +12,53 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" +#include "table/internal_iterator.h" #include "util/perf_context_imp.h" #include "util/statistics.h" namespace rocksdb { -// TODO(agiardullo): Clean up merge callsites to use this func -Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value, - const std::deque& operands, - const MergeOperator* merge_operator, +Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, + const Slice& key, const Slice* value, + const std::vector& operands, + std::string* result, Logger* logger, Statistics* statistics, Env* env, - Logger* logger, std::string* result) { + Slice* result_operand) { + assert(merge_operator != nullptr); + if (operands.size() == 0) { + assert(value != nullptr && result != nullptr); result->assign(value->data(), value->size()); return Status::OK(); } - if (merge_operator == nullptr) { - return Status::NotSupported("Provide a merge_operator when opening DB"); - } + bool success; + Slice tmp_result_operand(nullptr, 0); + const MergeOperator::MergeOperationInput merge_in(key, value, operands, + logger); + MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); + { + // Setup to time the merge + StopWatchNano timer(env, statistics != nullptr); + PERF_TIMER_GUARD(merge_operator_time_nanos); - // Setup to time the merge - StopWatchNano timer(env, statistics != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); + // Do the merge + success = merge_operator->FullMergeV2(merge_in, &merge_out); - // Do the merge - bool success = - merge_operator->FullMerge(key, value, operands, result, logger); + if 
(tmp_result_operand.data()) { + // FullMergeV2 result is an existing operand + if (result_operand != nullptr) { + *result_operand = tmp_result_operand; + } else { + result->assign(tmp_result_operand.data(), tmp_result_operand.size()); + } + } else if (result_operand) { + *result_operand = Slice(nullptr, 0); + } - RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanosSafe()); + RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, + statistics ? timer.ElapsedNanos() : 0); + } if (!success) { RecordTick(statistics, NUMBER_MERGE_FAILURES); @@ -56,13 +74,14 @@ Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value, // keys_ stores the list of keys encountered while merging. // operands_ stores the list of merge operands encountered while merging. // keys_[i] corresponds to operands_[i] for each i. -Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, +Status MergeHelper::MergeUntil(InternalIterator* iter, + const SequenceNumber stop_before, const bool at_bottom) { // Get a copy of the internal key, before it's invalidated by iter->Next() // Also maintain the list of merge operands seen. assert(HasOperator()); keys_.clear(); - operands_.clear(); + merge_context_.Clear(); assert(user_merge_operator_); bool first_key = true; @@ -84,7 +103,7 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, bool hit_the_next_user_key = false; for (; iter->Valid(); iter->Next(), original_key_is_iter = false) { ParsedInternalKey ikey; - assert(keys_.size() == operands_.size()); + assert(keys_.size() == merge_context_.GetNumOperands()); if (!ParseInternalKey(iter->key(), &ikey)) { // stop at corrupted key @@ -138,9 +157,9 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, const Slice val = iter->value(); const Slice* val_ptr = (kTypeValue == ikey.type) ? 
&val : nullptr; std::string merge_result; - s = TimedFullMerge(ikey.user_key, val_ptr, operands_, - user_merge_operator_, stats_, env_, logger_, - &merge_result); + s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, env_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) @@ -150,9 +169,9 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, orig_ikey.type = kTypeValue; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); keys_.clear(); - operands_.clear(); + merge_context_.Clear(); keys_.emplace_front(std::move(original_key)); - operands_.emplace_front(std::move(merge_result)); + merge_context_.PushOperand(merge_result); } // move iter to the next entry @@ -186,12 +205,13 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, // original_key before ParseInternalKey(keys_.back(), &orig_ikey); } - operands_.push_front(value_slice.ToString()); + merge_context_.PushOperand(value_slice, + iter->IsValuePinned() /* operand_pinned */); } } } - if (operands_.size() == 0) { + if (merge_context_.GetNumOperands() == 0) { // we filtered out all the merge operands return Status::OK(); } @@ -216,12 +236,12 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, // do a final merge with nullptr as the existing value and say // bye to the merge type (it's now converted to a Put) assert(kTypeMerge == orig_ikey.type); - assert(operands_.size() >= 1); - assert(operands_.size() == keys_.size()); + assert(merge_context_.GetNumOperands() >= 1); + assert(merge_context_.GetNumOperands() == keys_.size()); std::string merge_result; - s = TimedFullMerge(orig_ikey.user_key, nullptr, operands_, - user_merge_operator_, stats_, env_, logger_, - &merge_result); + s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, + 
merge_context_.GetOperands(), &merge_result, logger_, + stats_, env_); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of @@ -230,9 +250,9 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, orig_ikey.type = kTypeValue; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); keys_.clear(); - operands_.clear(); + merge_context_.Clear(); keys_.emplace_front(std::move(original_key)); - operands_.emplace_front(std::move(merge_result)); + merge_context_.PushOperand(merge_result); } } else { // We haven't seen the beginning of the key nor a Put/Delete. @@ -243,8 +263,8 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, // partial merge returns Status::OK(). Should we change the status code // after a successful partial merge? s = Status::MergeInProgress(); - if (operands_.size() >= 2 && - operands_.size() >= min_partial_merge_operands_) { + if (merge_context_.GetNumOperands() >= 2 && + merge_context_.GetNumOperands() >= min_partial_merge_operands_) { bool merge_success = false; std::string merge_result; { @@ -252,16 +272,17 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, PERF_TIMER_GUARD(merge_operator_time_nanos); merge_success = user_merge_operator_->PartialMergeMulti( orig_ikey.user_key, - std::deque(operands_.begin(), operands_.end()), + std::deque(merge_context_.GetOperands().begin(), + merge_context_.GetOperands().end()), &merge_result, logger_); RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanosSafe()); + stats_ ? timer.ElapsedNanosSafe() : 0); } if (merge_success) { // Merging of operands (associative merge) was successful. 
// Replace operands with the merge result - operands_.clear(); - operands_.emplace_front(std::move(merge_result)); + merge_context_.Clear(); + merge_context_.PushOperand(merge_result); keys_.erase(keys_.begin(), keys_.end() - 1); } } diff --git a/external/rocksdb/db/merge_helper.h b/external/rocksdb/db/merge_helper.h index ade3d71a6c..7cd992f34f 100644 --- a/external/rocksdb/db/merge_helper.h +++ b/external/rocksdb/db/merge_helper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,8 +8,10 @@ #include #include +#include #include "db/dbformat.h" +#include "db/merge_context.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" @@ -22,6 +24,7 @@ class Iterator; class Logger; class MergeOperator; class Statistics; +class InternalIterator; class MergeHelper { public: @@ -41,25 +44,24 @@ class MergeHelper { latest_snapshot_(latest_snapshot), level_(level), keys_(), - operands_(), filter_timer_(env_), total_filter_time_(0U), stats_(stats) { assert(user_comparator_ != nullptr); } - // Wrapper around MergeOperator::FullMerge() that records perf statistics. + // Wrapper around MergeOperator::FullMergeV2() that records perf statistics. // Result of merge will be written to result if status returned is OK. // If operands is empty, the value will simply be copied to result. // Returns one of the following statuses: // - OK: Entries were successfully merged. // - Corruption: Merge operator reported unsuccessful merge. - // - NotSupported: Merge operator is missing. 
- static Status TimedFullMerge(const Slice& key, const Slice* value, - const std::deque& operands, - const MergeOperator* merge_operator, - Statistics* statistics, Env* env, Logger* logger, - std::string* result); + static Status TimedFullMerge(const MergeOperator* merge_operator, + const Slice& key, const Slice* value, + const std::vector& operands, + std::string* result, Logger* logger, + Statistics* statistics, Env* env, + Slice* result_operand = nullptr); // Merge entries until we hit // - a corrupted key @@ -82,7 +84,8 @@ class MergeHelper { // with asserts removed). // // REQUIRED: The first key in the input is not corrupted. - Status MergeUntil(Iterator* iter, const SequenceNumber stop_before = 0, + Status MergeUntil(InternalIterator* iter, + const SequenceNumber stop_before = 0, const bool at_bottom = false); // Filters a merge operand using the compaction filter specified @@ -115,7 +118,9 @@ class MergeHelper { // So keys().back() was the first key seen by iterator. // TODO: Re-style this comment to be like the first one const std::deque& keys() const { return keys_; } - const std::deque& values() const { return operands_; } + const std::vector& values() const { + return merge_context_.GetOperands(); + } uint64_t TotalFilterTime() const { return total_filter_time_; } bool HasOperator() const { return user_merge_operator_ != nullptr; } @@ -132,8 +137,11 @@ class MergeHelper { // the scratch area that holds the result of MergeUntil // valid up to the next MergeUntil call - std::deque keys_; // Keeps track of the sequence of keys seen - std::deque operands_; // Parallel with keys_; stores the values + + // Keeps track of the sequence of keys seen + std::deque keys_; + // Parallel with keys_; stores the operands + mutable MergeContext merge_context_; StopWatchNano filter_timer_; uint64_t total_filter_time_; @@ -158,7 +166,7 @@ class MergeOutputIterator { private: const MergeHelper* merge_helper_; std::deque::const_reverse_iterator it_keys_; - 
std::deque::const_reverse_iterator it_values_; + std::vector::const_reverse_iterator it_values_; }; } // namespace rocksdb diff --git a/external/rocksdb/db/merge_helper_test.cc b/external/rocksdb/db/merge_helper_test.cc index 2ef0d39e4d..b21f560782 100644 --- a/external/rocksdb/db/merge_helper_test.cc +++ b/external/rocksdb/db/merge_helper_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/merge_operator.cc b/external/rocksdb/db/merge_operator.cc index c6645a910d..d4149f67ec 100644 --- a/external/rocksdb/db/merge_operator.cc +++ b/external/rocksdb/db/merge_operator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,6 +11,18 @@ namespace rocksdb { +bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // If FullMergeV2 is not implemented, we convert the operand_list to + // std::deque and pass it to FullMerge + std::deque operand_list_str; + for (auto& op : merge_in.operand_list) { + operand_list_str.emplace_back(op.data(), op.size()); + } + return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str, + &merge_out->new_value, merge_in.logger); +} + // The default implementation of PartialMergeMulti, which invokes // PartialMerge multiple times internally and merges two operands at // a time. 
@@ -39,23 +51,20 @@ bool MergeOperator::PartialMergeMulti(const Slice& key, // Given a "real" merge from the library, call the user's // associative merge function one-by-one on each of the operands. // NOTE: It is assumed that the client's merge-operator will handle any errors. -bool AssociativeMergeOperator::FullMerge( - const Slice& key, - const Slice* existing_value, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const { - +bool AssociativeMergeOperator::FullMergeV2( + const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { // Simply loop through the operands Slice temp_existing; - for (const auto& operand : operand_list) { - Slice value(operand); + const Slice* existing_value = merge_in.existing_value; + for (const auto& operand : merge_in.operand_list) { std::string temp_value; - if (!Merge(key, existing_value, value, &temp_value, logger)) { + if (!Merge(merge_in.key, existing_value, operand, &temp_value, + merge_in.logger)) { return false; } - swap(temp_value, *new_value); - temp_existing = Slice(*new_value); + swap(temp_value, merge_out->new_value); + temp_existing = Slice(merge_out->new_value); existing_value = &temp_existing; } diff --git a/external/rocksdb/db/merge_test.cc b/external/rocksdb/db/merge_test.cc index 192ea2fec7..020f33ba6f 100644 --- a/external/rocksdb/db/merge_test.cc +++ b/external/rocksdb/db/merge_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -20,7 +20,6 @@ #include "utilities/merge_operators.h" #include "util/testharness.h" -using namespace std; using namespace rocksdb; namespace { @@ -76,7 +75,7 @@ class CountMergeOperator : public AssociativeMergeOperator { }; namespace { -std::shared_ptr OpenDb(const string& dbname, const bool ttl = false, +std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, const size_t max_successive_merges = 0, const uint32_t min_partial_merge_operands = 2) { DB* db; @@ -90,7 +89,7 @@ std::shared_ptr OpenDb(const string& dbname, const bool ttl = false, // DBWithTTL is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE if (ttl) { - cout << "Opening database with TTL\n"; + std::cout << "Opening database with TTL\n"; DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options, dbname, &db_with_ttl); db = db_with_ttl; @@ -102,7 +101,7 @@ std::shared_ptr OpenDb(const string& dbname, const bool ttl = false, s = DB::Open(options, dbname, &db); #endif // !ROCKSDB_LITE if (!s.ok()) { - cerr << s.ToString() << endl; + std::cerr << s.ToString() << std::endl; assert(false); } return std::shared_ptr(db); @@ -142,7 +141,7 @@ class Counters { // if the underlying level db operation failed. 
// mapped to a levedb Put - bool set(const string& key, uint64_t value) { + bool set(const std::string& key, uint64_t value) { // just treat the internal rep of int64 as the string Slice slice((char *)&value, sizeof(value)); auto s = db_->Put(put_option_, key, slice); @@ -150,26 +149,26 @@ class Counters { if (s.ok()) { return true; } else { - cerr << s.ToString() << endl; + std::cerr << s.ToString() << std::endl; return false; } } // mapped to a rocksdb Delete - bool remove(const string& key) { + bool remove(const std::string& key) { auto s = db_->Delete(delete_option_, key); if (s.ok()) { return true; } else { - cerr << s.ToString() << std::endl; + std::cerr << s.ToString() << std::endl; return false; } } // mapped to a rocksdb Get - bool get(const string& key, uint64_t *value) { - string str; + bool get(const std::string& key, uint64_t* value) { + std::string str; auto s = db_->Get(get_option_, key, &str); if (s.IsNotFound()) { @@ -179,35 +178,33 @@ class Counters { } else if (s.ok()) { // deserialization if (str.size() != sizeof(uint64_t)) { - cerr << "value corruption\n"; + std::cerr << "value corruption\n"; return false; } *value = DecodeFixed64(&str[0]); return true; } else { - cerr << s.ToString() << std::endl; + std::cerr << s.ToString() << std::endl; return false; } } // 'add' is implemented as get -> modify -> set // An alternative is a single merge operation, see MergeBasedCounters - virtual bool add(const string& key, uint64_t value) { + virtual bool add(const std::string& key, uint64_t value) { uint64_t base = default_; return get(key, &base) && set(key, base + value); } // convenience functions for testing - void assert_set(const string& key, uint64_t value) { + void assert_set(const std::string& key, uint64_t value) { assert(set(key, value)); } - void assert_remove(const string& key) { - assert(remove(key)); - } + void assert_remove(const std::string& key) { assert(remove(key)); } - uint64_t assert_get(const string& key) { + uint64_t 
assert_get(const std::string& key) { uint64_t value = default_; int result = get(key, &value); assert(result); @@ -215,7 +212,7 @@ class Counters { return value; } - void assert_add(const string& key, uint64_t value) { + void assert_add(const std::string& key, uint64_t value) { int result = add(key, value); assert(result); if (result == 0) exit(1); // Disable unused variable warning. @@ -234,7 +231,7 @@ class MergeBasedCounters : public Counters { } // mapped to a rocksdb Merge operation - virtual bool add(const string& key, uint64_t value) override { + virtual bool add(const std::string& key, uint64_t value) override { char encoded[sizeof(uint64_t)]; EncodeFixed64(encoded, value); Slice slice(encoded, sizeof(uint64_t)); @@ -243,7 +240,7 @@ class MergeBasedCounters : public Counters { if (s.ok()) { return true; } else { - cerr << s.ToString() << endl; + std::cerr << s.ToString() << std::endl; return false; } } @@ -254,7 +251,7 @@ void dumpDb(DB* db) { auto it = unique_ptr(db->NewIterator(ReadOptions())); for (it->SeekToFirst(); it->Valid(); it->Next()) { uint64_t value = DecodeFixed64(it->value().data()); - cout << it->key().ToString() << ": " << value << endl; + std::cout << it->key().ToString() << ": " << value << std::endl; } assert(it->status().ok()); // Check for any errors found during the scan } @@ -302,9 +299,9 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { if (test_compaction) { db->Flush(o); - cout << "Compaction started ...\n"; + std::cout << "Compaction started ...\n"; db->CompactRange(CompactRangeOptions(), nullptr, nullptr); - cout << "Compaction ended\n"; + std::cout << "Compaction ended\n"; dumpDb(db); @@ -400,7 +397,7 @@ void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, // Get the value resetNumMergeOperatorCalls(); - string get_value_str; + std::string get_value_str; { Status s = db->Get(ReadOptions(), key, &get_value_str); assert(s.ok()); @@ -412,24 +409,24 @@ void testSingleBatchSuccessiveMerge(DB* db, 
size_t max_num_merges, static_cast((num_merges % (max_num_merges + 1)))); } -void runTest(int argc, const string& dbname, const bool use_ttl = false) { +void runTest(int argc, const std::string& dbname, const bool use_ttl = false) { bool compact = false; if (argc > 1) { compact = true; - cout << "Turn on Compaction\n"; + std::cout << "Turn on Compaction\n"; } { auto db = OpenDb(dbname, use_ttl); { - cout << "Test read-modify-write counters... \n"; + std::cout << "Test read-modify-write counters... \n"; Counters counters(db, 0); testCounters(counters, db.get(), true); } { - cout << "Test merge-based counters... \n"; + std::cout << "Test merge-based counters... \n"; MergeBasedCounters counters(db, 0); testCounters(counters, db.get(), compact); } @@ -438,7 +435,7 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { DestroyDB(dbname, Options()); { - cout << "Test merge in memtable... \n"; + std::cout << "Test merge in memtable... \n"; size_t max_merge = 5; auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); @@ -449,7 +446,7 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { } { - cout << "Test Partial-Merge\n"; + std::cout << "Test Partial-Merge\n"; size_t max_merge = 100; for (uint32_t min_merge = 5; min_merge < 25; min_merge += 5) { for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) { @@ -469,7 +466,7 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { } { - cout << "Test merge-operator not set after reopen\n"; + std::cout << "Test merge-operator not set after reopen\n"; { auto db = OpenDb(dbname); MergeBasedCounters counters(db, 0); @@ -489,7 +486,7 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { /* Temporary remove this test { - cout << "Test merge-operator not set after reopen (recovery case)\n"; + std::cout << "Test merge-operator not set after reopen (recovery case)\n"; { auto db = OpenDb(dbname); 
MergeBasedCounters counters(db, 0); diff --git a/external/rocksdb/db/options_file_test.cc b/external/rocksdb/db/options_file_test.cc new file mode 100644 index 0000000000..fbbc8c5529 --- /dev/null +++ b/external/rocksdb/db/options_file_test.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE +#include + +#include "db/db_impl.h" +#include "db/db_test_util.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "util/testharness.h" + +namespace rocksdb { +class OptionsFileTest : public testing::Test { + public: + OptionsFileTest() : dbname_(test::TmpDir() + "/options_file_test") {} + + std::string dbname_; +}; + +namespace { +void UpdateOptionsFiles(DB* db, + std::unordered_set* filename_history, + int* options_files_count) { + std::vector filenames; + db->GetEnv()->GetChildren(db->GetName(), &filenames); + uint64_t number; + FileType type; + *options_files_count = 0; + for (auto filename : filenames) { + if (ParseFileName(filename, &number, &type) && type == kOptionsFile) { + filename_history->insert(filename); + (*options_files_count)++; + } + } +} + +// Verify whether the current Options Files are the latest ones. 
+void VerifyOptionsFileName( + DB* db, const std::unordered_set& past_filenames) { + std::vector filenames; + std::unordered_set current_filenames; + db->GetEnv()->GetChildren(db->GetName(), &filenames); + uint64_t number; + FileType type; + for (auto filename : filenames) { + if (ParseFileName(filename, &number, &type) && type == kOptionsFile) { + current_filenames.insert(filename); + } + } + for (auto past_filename : past_filenames) { + if (current_filenames.find(past_filename) != current_filenames.end()) { + continue; + } + for (auto filename : current_filenames) { + ASSERT_GT(filename, past_filename); + } + } +} +} // namespace + +TEST_F(OptionsFileTest, NumberOfOptionsFiles) { + const int kReopenCount = 20; + Options opt; + opt.create_if_missing = true; + DestroyDB(dbname_, opt); + std::unordered_set filename_history; + DB* db; + for (int i = 0; i < kReopenCount; ++i) { + ASSERT_OK(DB::Open(opt, dbname_, &db)); + int num_options_files = 0; + UpdateOptionsFiles(db, &filename_history, &num_options_files); + ASSERT_GT(num_options_files, 0); + ASSERT_LE(num_options_files, 2); + // Make sure we always keep the latest option files. 
+ VerifyOptionsFileName(db, filename_history); + delete db; + } +} + +TEST_F(OptionsFileTest, OptionsFileName) { + const uint64_t kOptionsFileNum = 12345; + uint64_t number; + FileType type; + + auto options_file_name = OptionsFileName("", kOptionsFileNum); + ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr)); + ASSERT_EQ(type, kOptionsFile); + ASSERT_EQ(number, kOptionsFileNum); + + const uint64_t kTempOptionsFileNum = 54352; + auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum); + ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr)); + ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix), + std::string::npos); + ASSERT_EQ(type, kTempFile); + ASSERT_EQ(number, kTempOptionsFileNum); +} +} // namespace rocksdb + +int main(int argc, char** argv) { +#if !(defined NDEBUG) || !defined(OS_WIN) + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + return 0; +#endif // !(defined NDEBUG) || !defined(OS_WIN) +} +#else + +#include + +int main(int argc, char** argv) { + printf("Skipped as Options file is not supported in RocksDBLite.\n"); + return 0; +} +#endif // !ROCKSDB_LITE diff --git a/external/rocksdb/db/perf_context_test.cc b/external/rocksdb/db/perf_context_test.cc index 359562a167..90d724dbe4 100644 --- a/external/rocksdb/db/perf_context_test.cc +++ b/external/rocksdb/db/perf_context_test.cc @@ -1,22 +1,24 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// #include #include +#include #include #include "rocksdb/db.h" +#include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/memtablerep.h" #include "util/histogram.h" +#include "util/instrumented_mutex.h" #include "util/stop_watch.h" +#include "util/string_util.h" #include "util/testharness.h" #include "util/thread_status_util.h" -#include "util/string_util.h" - +#include "utilities/merge_operators.h" bool FLAGS_random_key = false; bool FLAGS_use_set_based_memetable = false; @@ -35,6 +37,7 @@ std::shared_ptr OpenDb(bool read_only = false) { DB* db; Options options; options.create_if_missing = true; + options.max_open_files = -1; options.write_buffer_size = FLAGS_write_buffer_size; options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.min_write_buffer_number_to_merge = @@ -277,14 +280,19 @@ void ProfileQueries(bool enabled_time = false) { #endif for (const int i : keys) { + if (i == kFlushFlag) { + continue; + } std::string key = "k" + ToString(i); - std::string value = "v" + ToString(i); + std::string expected_value = "v" + ToString(i); + std::string value; std::vector multiget_keys = {Slice(key)}; std::vector values; perf_context.Reset(); - db->Get(read_options, key, &value); + ASSERT_OK(db->Get(read_options, key, &value)); + ASSERT_EQ(expected_value, value); hist_get_snapshot.Add(perf_context.get_snapshot_time); hist_get_memtable.Add(perf_context.get_from_memtable_time); hist_get_files.Add(perf_context.get_from_output_files_time); @@ -373,14 +381,19 @@ void ProfileQueries(bool enabled_time = false) { hist_mget_num_memtable_checked.Clear(); for (const int i : keys) { + if (i == kFlushFlag) { + continue; + } std::string key = "k" + ToString(i); - std::string value = "v" + ToString(i); + std::string expected_value = "v" + ToString(i); + std::string value; std::vector multiget_keys = {Slice(key)}; std::vector values; perf_context.Reset(); - db->Get(read_options, key, &value); + 
ASSERT_OK(db->Get(read_options, key, &value)); + ASSERT_EQ(expected_value, value); hist_get_snapshot.Add(perf_context.get_snapshot_time); hist_get_memtable.Add(perf_context.get_from_memtable_time); hist_get_files.Add(perf_context.get_from_output_files_time); @@ -444,6 +457,7 @@ void ProfileQueries(bool enabled_time = false) { } } +#ifndef ROCKSDB_LITE TEST_F(PerfContextTest, KeyComparisonCount) { SetPerfLevel(kEnableCount); ProfileQueries(); @@ -454,6 +468,7 @@ TEST_F(PerfContextTest, KeyComparisonCount) { SetPerfLevel(kEnableTime); ProfileQueries(true); } +#endif // ROCKSDB_LITE // make perf_context_test // export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison @@ -539,6 +554,100 @@ TEST_F(PerfContextTest, SeekKeyComparison) { } } +TEST_F(PerfContextTest, DBMutexLockCounter) { + int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; + for (PerfLevel perf_level : + {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) { + for (int c = 0; c < 2; ++c) { + InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); + mutex.Lock(); + std::thread child_thread([&] { + SetPerfLevel(perf_level); + perf_context.Reset(); + ASSERT_EQ(perf_context.db_mutex_lock_nanos, 0); + mutex.Lock(); + mutex.Unlock(); + if (perf_level == PerfLevel::kEnableTimeExceptForMutex || + stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ASSERT_EQ(perf_context.db_mutex_lock_nanos, 0); + } else { + // increment the counter only when it's a DB Mutex + ASSERT_GT(perf_context.db_mutex_lock_nanos, 0); + } + }); + Env::Default()->SleepForMicroseconds(100); + mutex.Unlock(); + child_thread.join(); + } + } +} + +TEST_F(PerfContextTest, FalseDBMutexWait) { + SetPerfLevel(kEnableTime); + int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; + for (int c = 0; c < 2; ++c) { + InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); + InstrumentedCondVar lock(&mutex); + perf_context.Reset(); + mutex.Lock(); + lock.TimedWait(100); + mutex.Unlock(); + if (stats_code[c] == 
static_cast(DB_MUTEX_WAIT_MICROS)) { + // increment the counter only when it's a DB Mutex + ASSERT_GT(perf_context.db_condition_wait_nanos, 0); + } else { + ASSERT_EQ(perf_context.db_condition_wait_nanos, 0); + } + } +} + +TEST_F(PerfContextTest, ToString) { + perf_context.Reset(); + perf_context.block_read_count = 12345; + + std::string zero_included = perf_context.ToString(); + ASSERT_NE(std::string::npos, zero_included.find("= 0")); + ASSERT_NE(std::string::npos, zero_included.find("= 12345")); + + std::string zero_excluded = perf_context.ToString(true); + ASSERT_EQ(std::string::npos, zero_excluded.find("= 0")); + ASSERT_NE(std::string::npos, zero_excluded.find("= 12345")); +} + +TEST_F(PerfContextTest, MergeOperatorTime) { + DestroyDB(kDbName, Options()); + DB* db; + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Status s = DB::Open(options, kDbName, &db); + EXPECT_OK(s); + + std::string val; + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1")); + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2")); + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3")); + ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4")); + + SetPerfLevel(kEnableTime); + perf_context.Reset(); + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); + EXPECT_GT(perf_context.merge_operator_time_nanos, 0); + + ASSERT_OK(db->Flush(FlushOptions())); + + perf_context.Reset(); + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); + EXPECT_GT(perf_context.merge_operator_time_nanos, 0); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + perf_context.Reset(); + ASSERT_OK(db->Get(ReadOptions(), "k1", &val)); + EXPECT_GT(perf_context.merge_operator_time_nanos, 0); + + delete db; +} } int main(int argc, char** argv) { diff --git a/external/rocksdb/db/pinned_iterators_manager.h b/external/rocksdb/db/pinned_iterators_manager.h new file mode 100644 index 0000000000..38cb89ce0c --- /dev/null +++ 
b/external/rocksdb/db/pinned_iterators_manager.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include +#include +#include + +#include "table/internal_iterator.h" + +namespace rocksdb { + +// PinnedIteratorsManager will be notified whenever we need to pin an Iterator +// and it will be responsible for deleting pinned Iterators when they are +// not needed anymore. +class PinnedIteratorsManager { + public: + PinnedIteratorsManager() : pinning_enabled(false), pinned_iters_(nullptr) {} + ~PinnedIteratorsManager() { ReleasePinnedIterators(); } + + // Enable Iterators pinning + void StartPinning() { + if (!pinning_enabled) { + pinning_enabled = true; + if (!pinned_iters_) { + pinned_iters_.reset(new std::vector()); + } + } + } + + // Is pinning enabled ? 
+ bool PinningEnabled() { return pinning_enabled; } + + // Take ownership of iter if pinning is enabled and delete it when + // ReleasePinnedIterators() is called + void PinIteratorIfNeeded(InternalIterator* iter) { + if (!pinning_enabled || !iter) { + return; + } + pinned_iters_->push_back(iter); + } + + // Release pinned Iterators + inline void ReleasePinnedIterators() { + if (pinning_enabled) { + pinning_enabled = false; + + // Remove duplicate pointers + std::sort(pinned_iters_->begin(), pinned_iters_->end()); + std::unique(pinned_iters_->begin(), pinned_iters_->end()); + + for (auto& iter : *pinned_iters_) { + delete iter; + } + pinned_iters_->clear(); + } + } + + private: + bool pinning_enabled; + std::unique_ptr> pinned_iters_; +}; + +} // namespace rocksdb diff --git a/external/rocksdb/db/plain_table_db_test.cc b/external/rocksdb/db/plain_table_db_test.cc index d9c0082367..fdf98b0bf9 100644 --- a/external/rocksdb/db/plain_table_db_test.cc +++ b/external/rocksdb/db/plain_table_db_test.cc @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -29,6 +29,7 @@ #include "table/bloom_block.h" #include "table/table_builder.h" #include "table/plain_table_factory.h" +#include "table/plain_table_key_coding.h" #include "table/plain_table_reader.h" #include "util/hash.h" #include "util/logging.h" @@ -41,6 +42,59 @@ using std::unique_ptr; namespace rocksdb { +class PlainTableKeyDecoderTest : public testing::Test {}; + +TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) { + std::string tmp; + Random rnd(301); + const uint32_t kLength = 2222; + Slice contents = test::RandomString(&rnd, kLength, &tmp); + test::StringSource* string_source = + new test::StringSource(contents, 0, false); + + unique_ptr file_reader( + test::GetRandomAccessFileReader(string_source)); + unique_ptr file_info(new PlainTableReaderFileInfo( + std::move(file_reader), EnvOptions(), kLength)); + + { + PlainTableFileReader reader(file_info.get()); + + const uint32_t kReadSize = 77; + for (uint32_t pos = 0; pos < kLength; pos += kReadSize) { + uint32_t read_size = std::min(kLength - pos, kReadSize); + Slice out; + ASSERT_TRUE(reader.Read(pos, read_size, &out)); + ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size))); + } + + ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2); + } + + std::vector>> reads = { + {{600, 30}, {590, 30}, {600, 20}, {600, 40}}, + {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}}, + {{1000, 20}, {500, 20}, {1000, 50}}, + {{1000, 20}, {500, 20}, {500, 20}}, + {{1000, 20}, {500, 20}, {200, 20}, {500, 20}}, + {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}}, + {{600, 500}, {610, 20}, {100, 20}}, + {{500, 100}, {490, 100}, {550, 50}}, + }; + + std::vector num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2}; + + for (size_t i = 0; i < reads.size(); i++) { + string_source->set_total_reads(0); + PlainTableFileReader reader(file_info.get()); + for (auto p : reads[i]) { + Slice out; + ASSERT_TRUE(reader.Read(p.first, p.second, &out)); + ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second))); + } + 
ASSERT_EQ(num_file_reads[i], string_source->total_reads()); + } +} class PlainTableDBTest : public testing::Test, public testing::WithParamInterface { @@ -209,7 +263,9 @@ class TestPlainTableReader : public PlainTableReader { const TableProperties* table_properties, unique_ptr&& file, const ImmutableCFOptions& ioptions, - bool* expect_bloom_not_match, bool store_index_in_file) + bool* expect_bloom_not_match, bool store_index_in_file, + uint32_t column_family_id, + const std::string& column_family_name) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { @@ -222,6 +278,8 @@ class TestPlainTableReader : public PlainTableReader { EXPECT_TRUE(s.ok()); TableProperties* props = const_cast(table_properties); + EXPECT_EQ(column_family_id, static_cast(props->column_family_id)); + EXPECT_EQ(column_family_name, props->column_family_name); if (store_index_in_file) { auto bloom_version_ptr = props->user_collected_properties.find( PlainTablePropertyNames::kBloomVersion); @@ -254,35 +312,39 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableFactory : public PlainTableFactory { public: explicit TestPlainTableFactory(bool* expect_bloom_not_match, - const PlainTableOptions& options) + const PlainTableOptions& options, + uint32_t column_family_id, + std::string column_family_name) : PlainTableFactory(options), bloom_bits_per_key_(options.bloom_bits_per_key), hash_table_ratio_(options.hash_table_ratio), index_sparseness_(options.index_sparseness), store_index_in_file_(options.store_index_in_file), - expect_bloom_not_match_(expect_bloom_not_match) {} - - Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table) const override { + expect_bloom_not_match_(expect_bloom_not_match), + column_family_id_(column_family_id), + column_family_name_(std::move(column_family_name)) {} + + 
Status NewTableReader( + const TableReaderOptions& table_reader_options, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table, + bool prefetch_index_and_filter_in_cache) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions.env, - table_reader_options.ioptions.info_log, &props); + table_reader_options.ioptions, &props); EXPECT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions.env, + table_reader_options.ioptions, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); EXPECT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions.env, + table_reader_options.ioptions, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); EXPECT_TRUE(s.ok()); @@ -300,7 +362,7 @@ class TestPlainTableFactory : public PlainTableFactory { table_reader_options.internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, std::move(file), table_reader_options.ioptions, expect_bloom_not_match_, - store_index_in_file_)); + store_index_in_file_, column_family_id_, column_family_name_)); *table = std::move(new_reader); return s; @@ -312,6 +374,8 @@ class TestPlainTableFactory : public PlainTableFactory { size_t index_sparseness_; bool store_index_in_file_; bool* expect_bloom_not_match_; + const uint32_t column_family_id_; + const std::string column_family_name_; }; TEST_P(PlainTableDBTest, Flush) { @@ -438,7 +502,8 @@ TEST_P(PlainTableDBTest, Flush2) { plain_table_options.encoding_type = encoding_type; plain_table_options.store_index_in_file = store_index_in_file; options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options)); + &expect_bloom_not_match, plain_table_options, + 
0 /* column_family_id */, kDefaultColumnFamilyName)); DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); @@ -507,7 +572,8 @@ TEST_P(PlainTableDBTest, Iterator) { plain_table_options.encoding_type = encoding_type; options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options)); + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); } else { PlainTableOptions plain_table_options; plain_table_options.user_key_len = 16; @@ -518,7 +584,8 @@ TEST_P(PlainTableDBTest, Iterator) { plain_table_options.encoding_type = encoding_type; options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options)); + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); } DestroyAndReopen(&options); diff --git a/external/rocksdb/db/prefix_test.cc b/external/rocksdb/db/prefix_test.cc index d095d444f2..1fb22994ea 100644 --- a/external/rocksdb/db/prefix_test.cc +++ b/external/rocksdb/db/prefix_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -20,9 +20,11 @@ int main() { #include #include "rocksdb/comparator.h" #include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/table.h" #include "util/histogram.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -41,9 +43,8 @@ DEFINE_int64(write_buffer_size, 33554432, ""); DEFINE_int32(max_write_buffer_number, 2, ""); DEFINE_int32(min_write_buffer_number_to_merge, 1, ""); DEFINE_int32(skiplist_height, 4, ""); -DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); -DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); -DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, ""); +DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, ""); +DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, ""); DEFINE_int32(value_size, 40, ""); // Path to the database on file system @@ -145,6 +146,35 @@ std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix, } return result; } + +class SamePrefixTransform : public SliceTransform { + private: + const Slice prefix_; + std::string name_; + + public: + explicit SamePrefixTransform(const Slice& prefix) + : prefix_(prefix), name_("rocksdb.SamePrefix." 
+ prefix.ToString()) {} + + virtual const char* Name() const override { return name_.c_str(); } + + virtual Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return prefix_; + } + + virtual bool InDomain(const Slice& src) const override { + if (src.size() >= prefix_.size()) { + return Slice(src.data(), prefix_.size()) == prefix_; + } + return false; + } + + virtual bool InRange(const Slice& dst) const override { + return dst == prefix_; + } +}; + } // namespace class PrefixTest : public testing::Test { @@ -158,10 +188,15 @@ class PrefixTest : public testing::Test { options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge; - options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; - options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; - options.memtable_prefix_bloom_huge_page_tlb_size = - FLAGS_memtable_prefix_bloom_huge_page_tlb_size; + options.memtable_prefix_bloom_size_ratio = + FLAGS_memtable_prefix_bloom_size_ratio; + options.memtable_huge_page_size = FLAGS_memtable_huge_page_size; + + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Status s = DB::Open(options, kDbName, &db); EXPECT_OK(s); @@ -220,6 +255,55 @@ class PrefixTest : public testing::Test { Options options; }; +TEST(SamePrefixTest, InDomainTest) { + DB* db; + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(new SamePrefixTransform("HHKB")); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + WriteOptions write_options; + ReadOptions read_options; + { + ASSERT_OK(DB::Open(options, kDbName, &db)); + ASSERT_OK(db->Put(write_options, "HHKB 
pro2", "Mar 24, 2006")); + ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011")); + ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk")); + db->Flush(FlushOptions()); + std::string result; + auto db_iter = db->NewIterator(ReadOptions()); + + db_iter->Seek("Realforce 87u"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), "Realforce 87u"); + ASSERT_EQ(db_iter->value(), "idk"); + + delete db_iter; + delete db; + ASSERT_OK(DestroyDB(kDbName, Options())); + } + + { + ASSERT_OK(DB::Open(options, kDbName, &db)); + ASSERT_OK(db->Put(write_options, "pikachu", "1")); + ASSERT_OK(db->Put(write_options, "Meowth", "1")); + ASSERT_OK(db->Put(write_options, "Mewtwo", "idk")); + db->Flush(FlushOptions()); + std::string result; + auto db_iter = db->NewIterator(ReadOptions()); + + db_iter->Seek("Mewtwo"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + delete db_iter; + delete db; + ASSERT_OK(DestroyDB(kDbName, Options())); + } +} + TEST_F(PrefixTest, TestResult) { for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { FirstOption(); @@ -393,6 +477,59 @@ TEST_F(PrefixTest, TestResult) { } } +// Show results in prefix +TEST_F(PrefixTest, PrefixValid) { + for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { + FirstOption(); + while (NextOptions(num_buckets)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << " number of buckets: " << num_buckets << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + // Insert keys with common prefix and one key with different + Slice v16("v16"); + Slice v17("v17"); + Slice v18("v18"); + Slice v19("v19"); + PutKey(db.get(), write_options, 12345, 6, v16); + PutKey(db.get(), write_options, 12345, 7, v17); + PutKey(db.get(), write_options, 12345, 8, v18); + PutKey(db.get(), write_options, 12345, 9, v19); + PutKey(db.get(), write_options, 12346, 8, v16); + 
db->Flush(FlushOptions()); + db->Delete(write_options, TestKeyToSlice(TestKey(12346, 8))); + db->Flush(FlushOptions()); + read_options.prefix_same_as_start = true; + std::unique_ptr iter(db->NewIterator(read_options)); + SeekIterator(iter.get(), 12345, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v18 == iter->value()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v19 == iter->value()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8)); + + // Verify seeking past the prefix won't return a result. + SeekIterator(iter.get(), 12345, 10); + ASSERT_TRUE(!iter->Valid()); + } + } +} + TEST_F(PrefixTest, DynamicPrefixIterator) { while (NextOptions(FLAGS_bucket_count)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() diff --git a/external/rocksdb/db/repair.cc b/external/rocksdb/db/repair.cc index d1ef6db746..a46e33f2dd 100644 --- a/external/rocksdb/db/repair.cc +++ b/external/rocksdb/db/repair.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -74,15 +74,16 @@ #include "db/memtable.h" #include "db/table_cache.h" #include "db/version_edit.h" -#include "db/writebuffer.h" #include "db/write_batch_internal.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/options.h" #include "rocksdb/immutable_options.h" +#include "rocksdb/options.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/scoped_arena_iterator.h" #include "util/file_reader_writer.h" -#include "util/scoped_arena_iterator.h" +#include "util/string_util.h" namespace rocksdb { @@ -90,43 +91,114 @@ namespace { class Repairer { public: - Repairer(const std::string& dbname, const Options& options) + Repairer(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families, + const ColumnFamilyOptions& default_cf_opts, + const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs) : dbname_(dbname), - env_(options.env), - icmp_(options.comparator), - options_(SanitizeOptions(dbname, &icmp_, options)), - ioptions_(options_), + env_(db_options.env), + env_options_(), + db_options_(SanitizeOptions(dbname_, db_options)), + icmp_(default_cf_opts.comparator), + default_cf_opts_(default_cf_opts), + default_cf_iopts_( + ImmutableCFOptions(Options(db_options_, default_cf_opts))), + unknown_cf_opts_(unknown_cf_opts), + create_unknown_cfs_(create_unknown_cfs), raw_table_cache_( // TableCache can be small since we expect each table to be opened // once. 
- NewLRUCache(10, options_.table_cache_numshardbits)), + NewLRUCache(10, db_options_.table_cache_numshardbits)), + table_cache_(new TableCache(default_cf_iopts_, env_options_, + raw_table_cache_.get())), + wb_(db_options_.db_write_buffer_size), + wc_(db_options_.delayed_write_rate), + vset_(dbname_, &db_options_, env_options_, raw_table_cache_.get(), &wb_, + &wc_), next_file_number_(1) { - GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories_); + for (const auto& cfd : column_families) { + cf_name_to_opts_[cfd.name] = cfd.options; + } + } - table_cache_ = - new TableCache(ioptions_, env_options_, raw_table_cache_.get()); - edit_ = new VersionEdit(); + const ColumnFamilyOptions* GetColumnFamilyOptions( + const std::string& cf_name) { + if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) { + if (create_unknown_cfs_) { + return &unknown_cf_opts_; + } + return nullptr; + } + return &cf_name_to_opts_[cf_name]; + } + + // Adds a column family to the VersionSet with cf_options_ and updates + // manifest. 
+ Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { + const auto* cf_opts = GetColumnFamilyOptions(cf_name); + if (cf_opts == nullptr) { + return Status::Corruption("Encountered unknown column family with name=" + + cf_name + ", id=" + ToString(cf_id)); + } + Options opts(db_options_, *cf_opts); + MutableCFOptions mut_cf_opts(opts, ImmutableCFOptions(opts)); + + VersionEdit edit; + edit.SetComparatorName(opts.comparator->Name()); + edit.SetLogNumber(0); + edit.SetColumnFamily(cf_id); + ColumnFamilyData* cfd; + cfd = nullptr; + edit.AddColumnFamily(cf_name); + + mutex_.Lock(); + Status status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, + nullptr /* db_directory */, + false /* new_descriptor_log */, cf_opts); + mutex_.Unlock(); + return status; } ~Repairer() { delete table_cache_; - raw_table_cache_.reset(); - delete edit_; } Status Run() { Status status = FindFiles(); if (status.ok()) { + // Discard older manifests and start a fresh one + for (size_t i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + // Just create a DBImpl temporarily so we can reuse NewDB() + DBImpl* db_impl = new DBImpl(db_options_, dbname_); + status = db_impl->NewDB(); + delete db_impl; + } + if (status.ok()) { + // Recover using the fresh manifest created by NewDB() + status = + vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false); + } + if (status.ok()) { + // Need to scan existing SST files first so the column families are + // created before we process WAL files + ExtractMetaData(); + + // ExtractMetaData() uses table_fds_ to know which SST files' metadata to + // extract -- we need to clear it here since metadata for existing SST + // files has been extracted already + table_fds_.clear(); ConvertLogFilesToTables(); ExtractMetaData(); - status = WriteDescriptor(); + status = AddTables(); } if (status.ok()) { uint64_t bytes = 0; for (size_t i = 0; i < tables_.size(); i++) { bytes += 
tables_[i].meta.fd.GetFileSize(); } - Log(InfoLogLevel::WARN_LEVEL, options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "**** Repaired rocksdb %s; " "recovered %" ROCKSDB_PRIszt " files; %" PRIu64 "bytes. " @@ -140,34 +212,41 @@ class Repairer { private: struct TableInfo { FileMetaData meta; + uint32_t column_family_id; + std::string column_family_name; SequenceNumber min_sequence; SequenceNumber max_sequence; }; std::string const dbname_; Env* const env_; + const EnvOptions env_options_; + const DBOptions db_options_; const InternalKeyComparator icmp_; - std::vector> - int_tbl_prop_collector_factories_; - const Options options_; - const ImmutableCFOptions ioptions_; + const ColumnFamilyOptions default_cf_opts_; + const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference + const ColumnFamilyOptions unknown_cf_opts_; + const bool create_unknown_cfs_; std::shared_ptr raw_table_cache_; TableCache* table_cache_; - VersionEdit* edit_; + WriteBufferManager wb_; + WriteController wc_; + VersionSet vset_; + std::unordered_map cf_name_to_opts_; + InstrumentedMutex mutex_; std::vector manifests_; std::vector table_fds_; std::vector logs_; std::vector tables_; uint64_t next_file_number_; - const EnvOptions env_options_; Status FindFiles() { std::vector filenames; bool found_file = false; - for (uint32_t path_id = 0; path_id < options_.db_paths.size(); path_id++) { + for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { Status status = - env_->GetChildren(options_.db_paths[path_id].path, &filenames); + env_->GetChildren(db_options_.db_paths[path_id].path, &filenames); if (!status.ok()) { return status; } @@ -190,7 +269,8 @@ class Repairer { assert(path_id == 0); logs_.push_back(number); } else if (type == kTableFile) { - table_fds_.emplace_back(number, path_id, 0); + table_fds_.emplace_back(number, static_cast(path_id), + 0); } else { // Ignore other files } @@ -209,7 +289,7 @@ class Repairer { std::string logname = 
LogFileName(dbname_, logs_[i]); Status status = ConvertLogToTable(logs_[i]); if (!status.ok()) { - Log(InfoLogLevel::WARN_LEVEL, options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Log #%" PRIu64 ": ignoring conversion error: %s", logs_[i], status.ToString().c_str()); } @@ -243,69 +323,78 @@ class Repairer { // Create the log reader. LogReporter reporter; reporter.env = env_; - reporter.info_log = options_.info_log; + reporter.info_log = db_options_.info_log; reporter.lognum = log; - // We intentially make log::Reader do checksumming so that + // We intentionally make log::Reader do checksumming so that // corruptions cause entire commits to be skipped instead of // propagating bad information (like overly large sequence // numbers). - log::Reader reader(std::move(lfile_reader), &reporter, - true /*enable checksum*/, 0 /*initial_offset*/); + log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter, + true /*enable checksum*/, 0 /*initial_offset*/, log); + + // Initialize per-column family memtables + for (auto* cfd : *vset_.GetColumnFamilySet()) { + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + } + auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet()); // Read all the records and add to a memtable std::string scratch; Slice record; WriteBatch batch; - WriteBuffer wb(options_.db_write_buffer_size); - MemTable* mem = - new MemTable(icmp_, ioptions_, MutableCFOptions(options_, ioptions_), - &wb, kMaxSequenceNumber); - auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem); - mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { + if (record.size() < WriteBatchInternal::kHeader) { reporter.Corruption( record.size(), Status::Corruption("log record too small")); continue; } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, cf_mems_default); + status = 
WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr); if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { - Log(InfoLogLevel::WARN_LEVEL, - options_.info_log, "Log #%" PRIu64 ": ignoring %s", log, - status.ToString().c_str()); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Log #%" PRIu64 ": ignoring %s", log, status.ToString().c_str()); status = Status::OK(); // Keep going with rest of file } } - // Do not record a version edit for this conversion to a Table - // since ExtractMetaData() will also generate edits. - FileMetaData meta; - meta.fd = FileDescriptor(next_file_number_++, 0, 0); - { + // Dump a table for each column family with entries in this log file. + for (auto* cfd : *vset_.GetColumnFamilySet()) { + // Do not record a version edit for this conversion to a Table + // since ExtractMetaData() will also generate edits. + MemTable* mem = cfd->mem(); + if (mem->IsEmpty()) { + continue; + } + + FileMetaData meta; + meta.fd = FileDescriptor(next_file_number_++, 0, 0); ReadOptions ro; ro.total_order_seek = true; Arena arena; ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); - status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, - iter.get(), &meta, icmp_, - &int_tbl_prop_collector_factories_, {}, - kNoCompression, CompressionOptions(), false, nullptr); - } - delete mem->Unref(); - delete cf_mems_default; - mem = nullptr; - if (status.ok()) { - if (meta.fd.GetFileSize() > 0) { - table_fds_.push_back(meta.fd); + status = BuildTable( + dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), + env_options_, table_cache_, iter.get(), &meta, + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber, kNoCompression, + CompressionOptions(), false, nullptr /* internal_stats */, + TableFileCreationReason::kRecovery); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", 
log, + counter, meta.fd.GetNumber(), status.ToString().c_str()); + if (status.ok()) { + if (meta.fd.GetFileSize() > 0) { + table_fds_.push_back(meta.fd); + } + } else { + break; } } - Log(InfoLogLevel::INFO_LEVEL, options_.info_log, - "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", - log, counter, meta.fd.GetNumber(), status.ToString().c_str()); + delete cf_mems; return status; } @@ -316,13 +405,12 @@ class Repairer { Status status = ScanTable(&t); if (!status.ok()) { std::string fname = TableFileName( - options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId()); + db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId()); char file_num_buf[kFormatFileNumberBufSize]; FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); - Log(InfoLogLevel::WARN_LEVEL, options_.info_log, - "Table #%s: ignoring %s", file_num_buf, - status.ToString().c_str()); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Table #%s: ignoring %s", file_num_buf, status.ToString().c_str()); ArchiveFile(fname); } else { tables_.push_back(t); @@ -331,16 +419,52 @@ class Repairer { } Status ScanTable(TableInfo* t) { - std::string fname = TableFileName(options_.db_paths, t->meta.fd.GetNumber(), - t->meta.fd.GetPathId()); + std::string fname = TableFileName( + db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId()); int counter = 0; uint64_t file_size; Status status = env_->GetFileSize(fname, &file_size); t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(), file_size); + std::shared_ptr props; + if (status.ok()) { + status = table_cache_->GetTableProperties(env_options_, icmp_, t->meta.fd, + &props); + } if (status.ok()) { - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), env_options_, icmp_, t->meta.fd); + t->column_family_id = static_cast(props->column_family_id); + if (t->column_family_id == + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) { + 
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Table #%" PRIu64 + ": column family unknown (probably due to legacy format); " + "adding to default column family id 0.", + t->meta.fd.GetNumber()); + t->column_family_id = 0; + } + + if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) == + nullptr) { + status = + AddColumnFamily(props->column_family_name, t->column_family_id); + } + } + ColumnFamilyData* cfd = nullptr; + if (status.ok()) { + cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id); + if (cfd->GetName() != props->column_family_name) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Table #%" PRIu64 + ": inconsistent column family name '%s'; expected '%s' for column " + "family id %" PRIu32 ".", + t->meta.fd.GetNumber(), props->column_family_name.c_str(), + cfd->GetName().c_str(), t->column_family_id); + status = Status::Corruption(dbname_, "inconsistent column family name"); + } + } + if (status.ok()) { + InternalIterator* iter = table_cache_->NewIterator( + ReadOptions(), env_options_, cfd->internal_comparator(), t->meta.fd); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -348,9 +472,9 @@ class Repairer { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); if (!ParseInternalKey(key, &parsed)) { - Log(InfoLogLevel::ERROR_LEVEL, - options_.info_log, "Table #%" PRIu64 ": unparsable key %s", - t->meta.fd.GetNumber(), EscapeString(key).c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Table #%" PRIu64 ": unparsable key %s", t->meta.fd.GetNumber(), + EscapeString(key).c_str()); continue; } @@ -371,70 +495,51 @@ class Repairer { status = iter->status(); } delete iter; + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Table #%" PRIu64 ": %d entries %s", t->meta.fd.GetNumber(), counter, + status.ToString().c_str()); } - Log(InfoLogLevel::INFO_LEVEL, - options_.info_log, "Table #%" PRIu64 ": %d entries %s", - t->meta.fd.GetNumber(), 
counter, status.ToString().c_str()); return status; } - Status WriteDescriptor() { - std::string tmp = TempFileName(dbname_, 1); - unique_ptr file; - EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); - Status status = env_->NewWritableFile(tmp, &file, env_options); - if (!status.ok()) { - return status; - } - + Status AddTables() { + std::unordered_map> cf_id_to_tables; SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { + cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]); if (max_sequence < tables_[i].max_sequence) { max_sequence = tables_[i].max_sequence; } } + vset_.SetLastSequence(max_sequence); - edit_->SetComparatorName(icmp_.user_comparator()->Name()); - edit_->SetLogNumber(0); - edit_->SetNextFile(next_file_number_); - edit_->SetLastSequence(max_sequence); + for (const auto& cf_id_and_tables : cf_id_to_tables) { + auto* cfd = + vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first); + VersionEdit edit; + edit.SetComparatorName(cfd->user_comparator()->Name()); + edit.SetLogNumber(0); + edit.SetNextFile(next_file_number_); + edit.SetColumnFamily(cfd->GetID()); - for (size_t i = 0; i < tables_.size(); i++) { // TODO(opt): separate out into multiple levels - const TableInfo& t = tables_[i]; - edit_->AddFile(0, t.meta.fd.GetNumber(), t.meta.fd.GetPathId(), - t.meta.fd.GetFileSize(), t.meta.smallest, t.meta.largest, - t.min_sequence, t.max_sequence, - t.meta.marked_for_compaction); - } - - //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); - { - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); - log::Writer log(std::move(file_writer)); - std::string record; - edit_->EncodeTo(&record); - status = log.AddRecord(record); - } - - if (!status.ok()) { - env_->DeleteFile(tmp); - } else { - // Discard older manifests - for (size_t i = 0; i < manifests_.size(); i++) { - ArchiveFile(dbname_ + "/" + manifests_[i]); + for (const auto* table : 
cf_id_and_tables.second) { + edit.AddFile(0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), + table->meta.fd.GetFileSize(), table->meta.smallest, + table->meta.largest, table->min_sequence, + table->max_sequence, table->meta.marked_for_compaction); } - - // Install new manifest - status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); - if (status.ok()) { - status = SetCurrentFile(env_, dbname_, 1, nullptr); - } else { - env_->DeleteFile(tmp); + mutex_.Lock(); + Status status = vset_.LogAndApply( + cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, + nullptr /* db_directory */, false /* new_descriptor_log */); + mutex_.Unlock(); + if (!status.ok()) { + return status; } } - return status; + return Status::OK(); } void ArchiveFile(const std::string& fname) { @@ -453,15 +558,60 @@ class Repairer { new_file.append("/"); new_file.append((slash == nullptr) ? fname.c_str() : slash + 1); Status s = env_->RenameFile(fname, new_file); - Log(InfoLogLevel::INFO_LEVEL, - options_.info_log, "Archiving %s: %s\n", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Archiving %s: %s\n", fname.c_str(), s.ToString().c_str()); } }; -} // namespace + +Status GetDefaultCFOptions( + const std::vector& column_families, + ColumnFamilyOptions* res) { + assert(res != nullptr); + auto iter = std::find_if(column_families.begin(), column_families.end(), + [](const ColumnFamilyDescriptor& cfd) { + return cfd.name == kDefaultColumnFamilyName; + }); + if (iter == column_families.end()) { + return Status::InvalidArgument( + "column_families", "Must contain entry for default column family"); + } + *res = iter->options; + return Status::OK(); +} +} // anonymous namespace + +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families) { + ColumnFamilyOptions default_cf_opts; + Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (status.ok()) { + Repairer repairer(dbname, db_options, column_families, 
default_cf_opts, + ColumnFamilyOptions() /* unknown_cf_opts */, + false /* create_unknown_cfs */); + status = repairer.Run(); + } + return status; +} + +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families, + const ColumnFamilyOptions& unknown_cf_opts) { + ColumnFamilyOptions default_cf_opts; + Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (status.ok()) { + Repairer repairer(dbname, db_options, column_families, default_cf_opts, + unknown_cf_opts, true /* create_unknown_cfs */); + status = repairer.Run(); + } + return status; +} Status RepairDB(const std::string& dbname, const Options& options) { - Repairer repairer(dbname, options); + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */, + cf_options /* unknown_cf_opts */, + true /* create_unknown_cfs */); return repairer.Run(); } diff --git a/external/rocksdb/db/repair_test.cc b/external/rocksdb/db/repair_test.cc new file mode 100644 index 0000000000..93e5113b35 --- /dev/null +++ b/external/rocksdb/db/repair_test.cc @@ -0,0 +1,276 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include +#include + +#include "db/db_impl.h" +#include "db/db_test_util.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/transaction_log.h" +#include "util/file_util.h" +#include "util/string_util.h" + +namespace rocksdb { + +class RepairTest : public DBTestBase { + public: + RepairTest() : DBTestBase("/repair_test") {} + + std::string GetFirstSstPath() { + uint64_t manifest_size; + std::vector files; + db_->GetLiveFiles(files, &manifest_size); + auto sst_iter = + std::find_if(files.begin(), files.end(), [](const std::string& file) { + uint64_t number; + FileType type; + bool ok = ParseFileName(file, &number, &type); + return ok && type == kTableFile; + }); + return sst_iter == files.end() ? "" : dbname_ + *sst_iter; + } +}; + +TEST_F(RepairTest, LostManifest) { + // Add a couple SST files, delete the manifest, and verify RepairDB() saves + // the day. + Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() didn't change the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("key2"), "val2"); +} + +TEST_F(RepairTest, CorruptManifest) { + // Manifest is in an invalid format. Expect a full recovery. + Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + // Need to get path before Close() deletes db_, but overwrite it after Close() + // to ensure Close() didn't change the manifest. 
+ std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + CreateFile(env_, manifest_path, "blah"); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("key2"), "val2"); +} + +TEST_F(RepairTest, IncompleteManifest) { + // In this case, the manifest is valid but does not reference all of the SST + // files. Expect a full recovery. + Put("key", "val"); + Flush(); + std::string orig_manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + CopyFile(orig_manifest_path, orig_manifest_path + ".tmp"); + Put("key2", "val2"); + Flush(); + // Need to get path before Close() deletes db_, but overwrite it after Close() + // to ensure Close() didn't change the manifest. + std::string new_manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(new_manifest_path)); + // Replace the manifest with one that is only aware of the first SST file. + CopyFile(orig_manifest_path + ".tmp", new_manifest_path); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("key2"), "val2"); +} + +TEST_F(RepairTest, LostSst) { + // Delete one of the SST files but preserve the manifest that refers to it, + // then verify the DB is still usable for the intact SST. + Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + auto sst_path = GetFirstSstPath(); + ASSERT_FALSE(sst_path.empty()); + ASSERT_OK(env_->DeleteFile(sst_path)); + + Close(); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + // Exactly one of the key-value pairs should be in the DB now. 
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2")); +} + +TEST_F(RepairTest, CorruptSst) { + // Corrupt one of the SST files but preserve the manifest that refers to it, + // then verify the DB is still usable for the intact SST. + Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + auto sst_path = GetFirstSstPath(); + ASSERT_FALSE(sst_path.empty()); + CreateFile(env_, sst_path, "blah"); + + Close(); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + // Exactly one of the key-value pairs should be in the DB now. + ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2")); +} + +TEST_F(RepairTest, UnflushedSst) { + // This test case invokes repair while some data is unflushed, then verifies + // that data is in the db. + Put("key", "val"); + VectorLogPtr wal_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 1); + uint64_t total_ssts_size; + GetAllSSTFiles(&total_ssts_size); + ASSERT_EQ(total_ssts_size, 0); + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() didn't change the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 0); + GetAllSSTFiles(&total_ssts_size); + ASSERT_GT(total_ssts_size, 0); + ASSERT_EQ(Get("key"), "val"); +} + +TEST_F(RepairTest, RepairMultipleColumnFamilies) { + // Verify repair logic associates SST files with their original column + // families. 
+ const int kNumCfs = 3; + const int kEntriesPerCf = 2; + DestroyAndReopen(CurrentOptions()); + CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions()); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + Put(i, "key" + ToString(j), "val" + ToString(j)); + if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) { + // Leave one unflushed so we can verify WAL entries are properly + // associated with column families. + continue; + } + Flush(i); + } + } + + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() doesn't re-create the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + + ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"}, + CurrentOptions()); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j)); + } + } +} + +TEST_F(RepairTest, RepairColumnFamilyOptions) { + // Verify repair logic uses correct ColumnFamilyOptions when repairing a + // database with different options for column families. 
+ const int kNumCfs = 2; + const int kEntriesPerCf = 2; + + Options opts(CurrentOptions()), rev_opts(CurrentOptions()); + opts.comparator = BytewiseComparator(); + rev_opts.comparator = ReverseBytewiseComparator(); + + DestroyAndReopen(opts); + CreateColumnFamilies({"reverse"}, rev_opts); + ReopenWithColumnFamilies({"default", "reverse"}, + std::vector{opts, rev_opts}); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + Put(i, "key" + ToString(j), "val" + ToString(j)); + if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) { + // Leave one unflushed so we can verify RepairDB's flush logic + continue; + } + Flush(i); + } + } + Close(); + + // RepairDB() records the comparator in the manifest, and DB::Open would fail + // if a different comparator were used. + ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}}, + opts /* unknown_cf_opts */)); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"}, + std::vector{opts, rev_opts})); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j)); + } + } + + // Examine table properties to verify RepairDB() used the right options when + // converting WAL->SST + TablePropertiesCollection fname_to_props; + db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props); + ASSERT_EQ(fname_to_props.size(), 2U); + for (const auto& fname_and_props : fname_to_props) { + ASSERT_EQ(InternalKeyComparator(rev_opts.comparator).Name(), + fname_and_props.second->comparator_name); + } + + // Also check comparator when it's provided via "unknown" CF options + ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}}, + rev_opts /* unknown_cf_opts */)); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"}, + std::vector{opts, rev_opts})); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j)); + } + } +} +} 
// namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/external/rocksdb/db/skiplist.h b/external/rocksdb/db/skiplist.h index 787fad59d5..3fdbd8f545 100644 --- a/external/rocksdb/db/skiplist.h +++ b/external/rocksdb/db/skiplist.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -107,8 +107,9 @@ class SkipList { }; private: - const int32_t kMaxHeight_; - const int32_t kBranching_; + const uint16_t kMaxHeight_; + const uint16_t kBranching_; + const uint32_t kScaledInverseBranching_; // Immutable after construction Comparator const compare_; @@ -131,9 +132,6 @@ class SkipList { return max_height_.load(std::memory_order_relaxed); } - // Read/written only by Insert(). 
- Random rnd_; - Node* NewNode(const Key& key, int height); int RandomHeight(); bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } @@ -264,9 +262,11 @@ inline void SkipList::Iterator::SeekToLast() { template int SkipList::RandomHeight() { + auto rnd = Random::GetTLSInstance(); + // Increase height with probability 1 in kBranching int height = 1; - while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) { + while (height < kMaxHeight_ && rnd->Next() < kScaledInverseBranching_) { height++; } assert(height > 0); @@ -391,14 +391,16 @@ SkipList::SkipList(const Comparator cmp, Allocator* allocator, int32_t branching_factor) : kMaxHeight_(max_height), kBranching_(branching_factor), + kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), compare_(cmp), allocator_(allocator), head_(NewNode(0 /* any key will do */, max_height)), max_height_(1), - prev_height_(1), - rnd_(0xdeadbeef) { - assert(kMaxHeight_ > 0); - assert(kBranching_ > 0); + prev_height_(1) { + assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); + assert(branching_factor > 0 && + kBranching_ == static_cast(branching_factor)); + assert(kScaledInverseBranching_ > 0); // Allocate the prev_ Node* array, directly from the passed-in allocator. // prev_ does not need to be freed, as its life cycle is tied up with // the allocator as a whole. diff --git a/external/rocksdb/db/skiplist_test.cc b/external/rocksdb/db/skiplist_test.cc index 3d14186258..b4f98e34ca 100644 --- a/external/rocksdb/db/skiplist_test.cc +++ b/external/rocksdb/db/skiplist_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/db/slice.cc b/external/rocksdb/db/slice.cc deleted file mode 100644 index 7e7245d795..0000000000 --- a/external/rocksdb/db/slice.cc +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include "rocksdb/slice.h" - -namespace rocksdb { - -Slice::Slice(const SliceParts& parts, std::string* buf) { - size_t length = 0; - for (int i = 0; i < parts.num_parts; ++i) { - length += parts.parts[i].size(); - } - buf->reserve(length); - - for (int i = 0; i < parts.num_parts; ++i) { - buf->append(parts.parts[i].data(), parts.parts[i].size()); - } - data_ = buf->data(); - size_ = buf->size(); -} - -} // namespace rocksdb diff --git a/external/rocksdb/db/snapshot_impl.cc b/external/rocksdb/db/snapshot_impl.cc index 1546d68f69..5c4f6abaa8 100644 --- a/external/rocksdb/db/snapshot_impl.cc +++ b/external/rocksdb/db/snapshot_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -12,6 +12,9 @@ namespace rocksdb { ManagedSnapshot::ManagedSnapshot(DB* db) : db_(db), snapshot_(db->GetSnapshot()) {} +ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot) + : db_(db), snapshot_(_snapshot) {} + ManagedSnapshot::~ManagedSnapshot() { if (snapshot_) { db_->ReleaseSnapshot(snapshot_); diff --git a/external/rocksdb/db/snapshot_impl.h b/external/rocksdb/db/snapshot_impl.h index b4d58fdf01..aaac7a0e3a 100644 --- a/external/rocksdb/db/snapshot_impl.h +++ b/external/rocksdb/db/snapshot_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -34,6 +34,9 @@ class SnapshotImpl : public Snapshot { SnapshotList* list_; // just for sanity checks int64_t unix_time_; + + // Will this snapshot be used by a Transaction to do write-conflict checking? + bool is_write_conflict_boundary_; }; class SnapshotList { @@ -50,9 +53,10 @@ class SnapshotList { SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } const SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, - uint64_t unix_time) { + uint64_t unix_time, bool is_write_conflict_boundary) { s->number_ = seq; s->unix_time_ = unix_time; + s->is_write_conflict_boundary_ = is_write_conflict_boundary; s->list_ = this; s->next_ = &list_; s->prev_ = list_.prev_; @@ -71,14 +75,29 @@ class SnapshotList { } // retrieve all snapshot numbers. They are sorted in ascending order. 
- std::vector GetAll() { + std::vector GetAll( + SequenceNumber* oldest_write_conflict_snapshot = nullptr) { std::vector ret; + + if (oldest_write_conflict_snapshot != nullptr) { + *oldest_write_conflict_snapshot = kMaxSequenceNumber; + } + if (empty()) { return ret; } SnapshotImpl* s = &list_; while (s->next_ != &list_) { ret.push_back(s->next_->number_); + + if (oldest_write_conflict_snapshot != nullptr && + *oldest_write_conflict_snapshot == kMaxSequenceNumber && + s->next_->is_write_conflict_boundary_) { + // If this is the first write-conflict boundary snapshot in the list, + // it is the oldest + *oldest_write_conflict_snapshot = s->next_->number_; + } + s = s->next_; } return ret; diff --git a/external/rocksdb/db/table_cache.cc b/external/rocksdb/db/table_cache.cc index b240fc7d0f..ff79975667 100644 --- a/external/rocksdb/db/table_cache.cc +++ b/external/rocksdb/db/table_cache.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -14,6 +14,7 @@ #include "db/version_edit.h" #include "rocksdb/statistics.h" +#include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "table/table_builder.h" #include "table/table_reader.h" @@ -86,18 +87,19 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { Status TableCache::GetTableReader( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, - unique_ptr* table_reader) { + bool sequential_mode, size_t readahead, bool record_read_stats, + HistogramImpl* file_read_hist, unique_ptr* table_reader, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache) { std::string fname = TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); unique_ptr file; Status s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); - if (sequential_mode && ioptions_.compaction_readahead_size > 0) { - file = NewReadaheadRandomAccessFile(std::move(file), - ioptions_.compaction_readahead_size); - } + RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { + if (readahead > 0) { + file = NewReadaheadRandomAccessFile(std::move(file), readahead); + } if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } @@ -107,18 +109,29 @@ Status TableCache::GetTableReader( ioptions_.statistics, record_read_stats, file_read_hist)); s = ioptions_.table_factory->NewTableReader( - TableReaderOptions(ioptions_, env_options, internal_comparator), - std::move(file_reader), fd.GetFileSize(), table_reader); + TableReaderOptions(ioptions_, env_options, internal_comparator, + skip_filters, level), + std::move(file_reader), fd.GetFileSize(), table_reader, + prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); } return s; } +void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) { + ReleaseHandle(handle); + 
uint64_t number = fd.GetNumber(); + Slice key = GetSliceForFileNumber(&number); + cache_->Erase(key); +} + Status TableCache::FindTable(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, const bool no_io, bool record_read_stats, - HistogramImpl* file_read_hist) { + HistogramImpl* file_read_hist, bool skip_filters, + int level, + bool prefetch_index_and_filter_in_cache) { PERF_TIMER_GUARD(find_table_nanos); Status s; uint64_t number = fd.GetNumber(); @@ -133,28 +146,31 @@ Status TableCache::FindTable(const EnvOptions& env_options, } unique_ptr table_reader; s = GetTableReader(env_options, internal_comparator, fd, - false /* sequential mode */, record_read_stats, - file_read_hist, &table_reader); + false /* sequential mode */, 0 /* readahead */, + record_read_stats, file_read_hist, &table_reader, + skip_filters, level, prefetch_index_and_filter_in_cache); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.statistics, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { - *handle = cache_->Insert(key, table_reader.release(), 1, - &DeleteEntry); + s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry, + handle); + if (s.ok()) { + // Release ownership of table reader. 
+ table_reader.release(); + } } } return s; } -Iterator* TableCache::NewIterator(const ReadOptions& options, - const EnvOptions& env_options, - const InternalKeyComparator& icomparator, - const FileDescriptor& fd, - TableReader** table_reader_ptr, - HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena) { +InternalIterator* TableCache::NewIterator( + const ReadOptions& options, const EnvOptions& env_options, + const InternalKeyComparator& icomparator, const FileDescriptor& fd, + TableReader** table_reader_ptr, HistogramImpl* file_read_hist, + bool for_compaction, Arena* arena, bool skip_filters, int level) { PERF_TIMER_GUARD(new_table_iterator_nanos); if (table_reader_ptr != nullptr) { @@ -163,32 +179,45 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, TableReader* table_reader = nullptr; Cache::Handle* handle = nullptr; - bool create_new_table_reader = - (for_compaction && ioptions_.new_table_reader_for_compaction_inputs); + + size_t readahead = 0; + bool create_new_table_reader = false; + if (for_compaction) { + if (ioptions_.new_table_reader_for_compaction_inputs) { + readahead = ioptions_.compaction_readahead_size; + create_new_table_reader = true; + } + } else { + readahead = options.readahead_size; + create_new_table_reader = readahead > 0; + } + if (create_new_table_reader) { unique_ptr table_reader_unique_ptr; Status s = GetTableReader( - env_options, icomparator, fd, /* sequential mode */ true, - /* record stats */ false, nullptr, &table_reader_unique_ptr); + env_options, icomparator, fd, true /* sequential_mode */, readahead, + !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr, + false /* skip_filters */, level); if (!s.ok()) { - return NewErrorIterator(s, arena); + return NewErrorInternalIterator(s, arena); } table_reader = table_reader_unique_ptr.release(); } else { table_reader = fd.table_reader; if (table_reader == nullptr) { - Status s = - FindTable(env_options, icomparator, fd, &handle, - 
options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist); + Status s = FindTable(env_options, icomparator, fd, &handle, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record read_stats */, + file_read_hist, skip_filters, level); if (!s.ok()) { - return NewErrorIterator(s, arena); + return NewErrorInternalIterator(s, arena); } table_reader = GetTableReaderFromHandle(handle); } } - Iterator* result = table_reader->NewIterator(options, arena); + InternalIterator* result = + table_reader->NewIterator(options, arena, skip_filters); if (create_new_table_reader) { assert(handle == nullptr); @@ -210,7 +239,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, const Slice& k, - GetContext* get_context, HistogramImpl* file_read_hist) { + GetContext* get_context, HistogramImpl* file_read_hist, + bool skip_filters, int level) { TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; @@ -220,7 +250,9 @@ Status TableCache::Get(const ReadOptions& options, IterKey row_cache_key; std::string row_cache_entry_buffer; - if (ioptions_.row_cache) { + // Check row cache if enabled. Since row cache does not currently store + // sequence numbers, we cannot use it if we need to fetch the sequence. 
+ if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { uint64_t fd_number = fd.GetNumber(); auto user_key = ExtractUserKey(k); // We use the user key as cache key instead of the internal key, @@ -257,19 +289,20 @@ Status TableCache::Get(const ReadOptions& options, if (!t) { s = FindTable(env_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist); + true /* record_read_stats */, file_read_hist, skip_filters, + level); if (s.ok()) { t = GetTableReaderFromHandle(handle); } } if (s.ok()) { get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. - s = t->Get(options, k, get_context); + s = t->Get(options, k, get_context, skip_filters); get_context->SetReplayLog(nullptr); if (handle != nullptr) { ReleaseHandle(handle); } - } else if (options.read_tier && s.IsIncomplete()) { + } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set get_context->MarkKeyMayExist(); return Status::OK(); @@ -281,9 +314,8 @@ Status TableCache::Get(const ReadOptions& options, size_t charge = row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string); void* row_ptr = new std::string(std::move(*row_cache_entry)); - auto row_handle = ioptions_.row_cache->Insert( - row_cache_key.GetKey(), row_ptr, charge, &DeleteEntry); - ioptions_.row_cache->Release(row_handle); + ioptions_.row_cache->Insert(row_cache_key.GetKey(), row_ptr, charge, + &DeleteEntry); } #endif // ROCKSDB_LITE diff --git a/external/rocksdb/db/table_cache.h b/external/rocksdb/db/table_cache.h index d9ae013485..cdd0f23dcb 100644 --- a/external/rocksdb/db/table_cache.h +++ b/external/rocksdb/db/table_cache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -29,6 +29,7 @@ class Arena; struct FileDescriptor; class GetContext; class HistogramImpl; +class InternalIterator; class TableCache { public: @@ -43,30 +44,43 @@ class TableCache { // the returned iterator. The returned "*tableptr" object is owned by // the cache and should not be deleted, and is valid for as long as the // returned iterator is live. - Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, - TableReader** table_reader_ptr = nullptr, - HistogramImpl* file_read_hist = nullptr, - bool for_compaction = false, Arena* arena = nullptr); + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" + InternalIterator* NewIterator( + const ReadOptions& options, const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileDescriptor& file_fd, TableReader** table_reader_ptr = nullptr, + HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, + Arena* arena = nullptr, bool skip_filters = false, int level = -1); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until // it returns false. 
+ // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, const Slice& k, - GetContext* get_context, HistogramImpl* file_read_hist = nullptr); + GetContext* get_context, HistogramImpl* file_read_hist = nullptr, + bool skip_filters = false, int level = -1); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); + // Clean table handle and erase it from the table cache + // Used in DB close, or the file is not live anymore. + void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle); + // Find table reader + // @param skip_filters Disables loading/accessing the filter block + // @param level == -1 means not specified Status FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, Cache::Handle**, const bool no_io = false, bool record_read_stats = true, - HistogramImpl* file_read_hist = nullptr); + HistogramImpl* file_read_hist = nullptr, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true); // Get TableReader from a cache handle. 
TableReader* GetTableReaderFromHandle(Cache::Handle* handle); @@ -98,8 +112,11 @@ class TableCache { Status GetTableReader(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, bool sequential_mode, - bool record_read_stats, HistogramImpl* file_read_hist, - unique_ptr* table_reader); + size_t readahead, bool record_read_stats, + HistogramImpl* file_read_hist, + unique_ptr* table_reader, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true); const ImmutableCFOptions& ioptions_; const EnvOptions& env_options_; diff --git a/external/rocksdb/db/table_properties_collector.cc b/external/rocksdb/db/table_properties_collector.cc index c14ecec11e..c5ec4754a2 100644 --- a/external/rocksdb/db/table_properties_collector.cc +++ b/external/rocksdb/db/table_properties_collector.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -23,6 +23,8 @@ Status InternalKeyPropertiesCollector::InternalAdd(const Slice& key, if (ikey.type == ValueType::kTypeDeletion || ikey.type == ValueType::kTypeSingleDeletion) { ++deleted_keys_; + } else if (ikey.type == ValueType::kTypeMerge) { + ++merge_operands_; } return Status::OK(); @@ -33,19 +35,26 @@ Status InternalKeyPropertiesCollector::Finish( assert(properties); assert(properties->find( InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); - std::string val; + assert(properties->find(InternalKeyTablePropertiesNames::kMergeOperands) == + properties->end()); - PutVarint64(&val, deleted_keys_); - properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val }); + std::string val_deleted_keys; + PutVarint64(&val_deleted_keys, deleted_keys_); + properties->insert( + {InternalKeyTablePropertiesNames::kDeletedKeys, val_deleted_keys}); + + std::string val_merge_operands; + PutVarint64(&val_merge_operands, merge_operands_); + properties->insert( + {InternalKeyTablePropertiesNames::kMergeOperands, val_merge_operands}); return Status::OK(); } UserCollectedProperties InternalKeyPropertiesCollector::GetReadableProperties() const { - return { - { "kDeletedKeys", ToString(deleted_keys_) } - }; + return {{"kDeletedKeys", ToString(deleted_keys_)}, + {"kMergeOperands", ToString(merge_operands_)}}; } namespace { @@ -65,6 +74,20 @@ EntryType GetEntryType(ValueType value_type) { } } +uint64_t GetUint64Property(const UserCollectedProperties& props, + const std::string property_name, + bool* property_present) { + auto pos = props.find(property_name); + if (pos == props.end()) { + *property_present = false; + return 0; + } + Slice raw = pos->second; + uint64_t val = 0; + *property_present = true; + return GetVarint64(&raw, &val) ? 
val : 0; +} + } // namespace Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, @@ -92,16 +115,20 @@ UserKeyTablePropertiesCollector::GetReadableProperties() const { const std::string InternalKeyTablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; +const std::string InternalKeyTablePropertiesNames::kMergeOperands = + "rocksdb.merge.operands"; uint64_t GetDeletedKeys( const UserCollectedProperties& props) { - auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys); - if (pos == props.end()) { - return 0; - } - Slice raw = pos->second; - uint64_t val = 0; - return GetVarint64(&raw, &val) ? val : 0; + bool property_present_ignored; + return GetUint64Property(props, InternalKeyTablePropertiesNames::kDeletedKeys, + &property_present_ignored); +} + +uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present) { + return GetUint64Property( + props, InternalKeyTablePropertiesNames::kMergeOperands, property_present); } } // namespace rocksdb diff --git a/external/rocksdb/db/table_properties_collector.h b/external/rocksdb/db/table_properties_collector.h index 51c2ba9151..b28cbfc006 100644 --- a/external/rocksdb/db/table_properties_collector.h +++ b/external/rocksdb/db/table_properties_collector.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,6 +16,7 @@ namespace rocksdb { struct InternalKeyTablePropertiesNames { static const std::string kDeletedKeys; + static const std::string kMergeOperands; }; // Base class for internal table properties collector. 
@@ -41,7 +42,8 @@ class IntTblPropCollectorFactory { public: virtual ~IntTblPropCollectorFactory() {} // has to be thread-safe - virtual IntTblPropCollector* CreateIntTblPropCollector() = 0; + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t column_family_id) = 0; // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; @@ -64,12 +66,14 @@ class InternalKeyPropertiesCollector : public IntTblPropCollector { private: uint64_t deleted_keys_ = 0; + uint64_t merge_operands_ = 0; }; class InternalKeyPropertiesCollectorFactory : public IntTblPropCollectorFactory { public: - virtual IntTblPropCollector* CreateIntTblPropCollector() override { + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t column_family_id) override { return new InternalKeyPropertiesCollector(); } @@ -114,9 +118,12 @@ class UserKeyTablePropertiesCollectorFactory explicit UserKeyTablePropertiesCollectorFactory( std::shared_ptr user_collector_factory) : user_collector_factory_(user_collector_factory) {} - virtual IntTblPropCollector* CreateIntTblPropCollector() override { + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t column_family_id) override { + TablePropertiesCollectorFactory::Context context; + context.column_family_id = column_family_id; return new UserKeyTablePropertiesCollector( - user_collector_factory_->CreateTablePropertiesCollector()); + user_collector_factory_->CreateTablePropertiesCollector(context)); } virtual const char* Name() const override { diff --git a/external/rocksdb/db/table_properties_collector_test.cc b/external/rocksdb/db/table_properties_collector_test.cc index 0eeed81912..2033172673 100644 --- a/external/rocksdb/db/table_properties_collector_test.cc +++ b/external/rocksdb/db/table_properties_collector_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -35,6 +35,9 @@ class TablePropertiesTest : public testing::Test, // Utilities test functions namespace { +static const uint32_t kTestColumnFamilyId = 66; +static const std::string kTestColumnFamilyName = "test_column_fam"; + void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, const std::vector>* @@ -46,6 +49,7 @@ void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, builder->reset(NewTableBuilder( ioptions, internal_comparator, int_tbl_prop_collector_factories, + kTestColumnFamilyId, kTestColumnFamilyName, writable->get(), options.compression, options.compression_opts)); } } // namespace @@ -178,14 +182,17 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, public: explicit RegularKeysStartWithAFactory(bool backward_mode) : backward_mode_(backward_mode) {} - virtual TablePropertiesCollector* CreateTablePropertiesCollector() override { + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + EXPECT_EQ(kTestColumnFamilyId, context.column_family_id); if (!backward_mode_) { return new RegularKeysStartWithA(); } else { return new RegularKeysStartWithABackwardCompatible(); } } - virtual IntTblPropCollector* CreateIntTblPropCollector() override { + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t column_family_id) override { return new RegularKeysStartWithAInternal(); } const char* Name() const override { return "RegularKeysStartWithA"; } @@ -269,7 +276,7 @@ void TestCustomizedTablePropertiesCollector( new test::StringSource(fwf->contents()))); TableProperties* props; Status s = ReadTableProperties(fake_file_reader.get(), 
fwf->contents().size(), - magic_number, Env::Default(), nullptr, &props); + magic_number, ioptions, &props); std::unique_ptr props_guard(props); ASSERT_OK(s); @@ -361,6 +368,8 @@ void TestInternalKeyPropertiesCollector( InternalKey("Y ", 5, ValueType::kTypeDeletion), InternalKey("Z ", 6, ValueType::kTypeDeletion), InternalKey("a ", 7, ValueType::kTypeSingleDeletion), + InternalKey("b ", 8, ValueType::kTypeMerge), + InternalKey("c ", 9, ValueType::kTypeMerge), }; std::unique_ptr builder; @@ -408,7 +417,7 @@ void TestInternalKeyPropertiesCollector( TableProperties* props; Status s = ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, - Env::Default(), nullptr, &props); + ioptions, &props); ASSERT_OK(s); std::unique_ptr props_guard(props); @@ -416,6 +425,11 @@ void TestInternalKeyPropertiesCollector( uint64_t deleted = GetDeletedKeys(user_collected); ASSERT_EQ(5u, deleted); // deletes + single-deletes + bool property_present; + uint64_t merges = GetMergeOperands(user_collected, &property_present); + ASSERT_TRUE(property_present); + ASSERT_EQ(2u, merges); + if (sanitized) { uint32_t starts_with_A = 0; ASSERT_NE(user_collected.find("Count"), user_collected.end()); diff --git a/external/rocksdb/db/transaction_log_impl.cc b/external/rocksdb/db/transaction_log_impl.cc index 23bd6672b9..91ed054127 100644 --- a/external/rocksdb/db/transaction_log_impl.cc +++ b/external/rocksdb/db/transaction_log_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -8,8 +8,8 @@ #define __STDC_FORMAT_MACROS #endif -#include #include "db/transaction_log_impl.h" +#include #include "db/write_batch_internal.h" #include "util/file_reader_writer.h" @@ -107,7 +107,7 @@ void TransactionLogIteratorImpl::SeekToStartSequence( return; } while (RestrictedRead(&record, &scratch)) { - if (record.size() < 12) { + if (record.size() < WriteBatchInternal::kHeader) { reporter_.Corruption( record.size(), Status::Corruption("very small log record")); continue; @@ -167,7 +167,7 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { currentLogReader_->UnmarkEOF(); } while (RestrictedRead(&record, &scratch)) { - if (record.size() < 12) { + if (record.size() < WriteBatchInternal::kHeader) { reporter_.Corruption( record.size(), Status::Corruption("very small log record")); continue; @@ -250,7 +250,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { // currentBatchSeq_ can only change here assert(currentLastSeq_ <= versions_->LastSequence()); - currentBatch_ = move(batch); + currentBatch_ = std::move(batch); isValid_ = true; currentStatus_ = Status::OK(); } @@ -262,8 +262,9 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { return s; } assert(file); - currentLogReader_.reset(new log::Reader(std::move(file), &reporter_, - read_options_.verify_checksums_, 0)); + currentLogReader_.reset(new log::Reader( + options_->info_log, std::move(file), &reporter_, + read_options_.verify_checksums_, 0, logFile->LogNumber())); return Status::OK(); } } // namespace rocksdb diff --git a/external/rocksdb/db/transaction_log_impl.h b/external/rocksdb/db/transaction_log_impl.h index f89cc32070..d4a2468e7f 100644 --- a/external/rocksdb/db/transaction_log_impl.h +++ b/external/rocksdb/db/transaction_log_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/version_builder.cc b/external/rocksdb/db/version_builder.cc index 7444bfc5c6..2837686be5 100644 --- a/external/rocksdb/db/version_builder.cc +++ b/external/rocksdb/db/version_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -82,6 +82,7 @@ class VersionBuilder::Rep { }; const EnvOptions& env_options_; + Logger* info_log_; TableCache* table_cache_; VersionStorageInfo* base_vstorage_; LevelState* levels_; @@ -89,9 +90,10 @@ class VersionBuilder::Rep { FileComparator level_nonzero_cmp_; public: - Rep(const EnvOptions& env_options, TableCache* table_cache, + Rep(const EnvOptions& env_options, Logger* info_log, TableCache* table_cache, VersionStorageInfo* base_vstorage) : env_options_(env_options), + info_log_(info_log), table_cache_(table_cache), base_vstorage_(base_vstorage) { levels_ = new LevelState[base_vstorage_->num_levels()]; @@ -134,7 +136,10 @@ class VersionBuilder::Rep { auto f2 = level_files[i]; if (level == 0) { assert(level_zero_cmp_(f1, f2)); - assert(f1->largest_seqno > f2->largest_seqno); + assert(f1->largest_seqno > f2->largest_seqno || + // We can have multiple files with seqno = 0 as a result of + // using DB::AddFile() + (f1->largest_seqno == 0 && f2->largest_seqno == 0)); } else { assert(level_nonzero_cmp_(f1, f2)); @@ -160,7 +165,7 @@ class VersionBuilder::Rep { for (int l = 0; !found && l < base_vstorage_->num_levels(); l++) { const std::vector& base_files = 
base_vstorage_->LevelFiles(l); - for (unsigned int i = 0; i < base_files.size(); i++) { + for (size_t i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; if (f->fd.GetNumber() == number) { found = true; @@ -282,7 +287,8 @@ class VersionBuilder::Rep { CheckConsistency(vstorage); } - void LoadTableHandlers(InternalStats* internal_stats, int max_threads) { + void LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache) { assert(table_cache_ != nullptr); // std::vector> files_meta; @@ -308,7 +314,8 @@ class VersionBuilder::Rep { *(base_vstorage_->InternalComparator()), file_meta->fd, &file_meta->table_reader_handle, false /*no_io */, true /* record_read_stats */, - internal_stats->GetFileReadHist(level)); + internal_stats->GetFileReadHist(level), false, + level, prefetch_index_and_filter_in_cache); if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( @@ -333,17 +340,19 @@ class VersionBuilder::Rep { void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { - // File is deleted: do nothing + // f is to-be-delected table file + vstorage->RemoveCurrentStats(f); } else { - vstorage->AddFile(level, f); + vstorage->AddFile(level, f, info_log_); } } }; VersionBuilder::VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, - VersionStorageInfo* base_vstorage) - : rep_(new Rep(env_options, table_cache, base_vstorage)) {} + VersionStorageInfo* base_vstorage, + Logger* info_log) + : rep_(new Rep(env_options, info_log, table_cache, base_vstorage)) {} VersionBuilder::~VersionBuilder() { delete rep_; } void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { rep_->CheckConsistency(vstorage); @@ -356,9 +365,11 @@ void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } void 
VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { rep_->SaveTo(vstorage); } -void VersionBuilder::LoadTableHandlers(InternalStats* internal_stats, - int max_threads) { - rep_->LoadTableHandlers(internal_stats, max_threads); +void VersionBuilder::LoadTableHandlers( + InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache) { + rep_->LoadTableHandlers(internal_stats, max_threads, + prefetch_index_and_filter_in_cache); } void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { diff --git a/external/rocksdb/db/version_builder.h b/external/rocksdb/db/version_builder.h index c7ef2796c5..44ff75939b 100644 --- a/external/rocksdb/db/version_builder.h +++ b/external/rocksdb/db/version_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -24,14 +24,15 @@ class InternalStats; class VersionBuilder { public: VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, - VersionStorageInfo* base_vstorage); + VersionStorageInfo* base_vstorage, Logger* info_log = nullptr); ~VersionBuilder(); void CheckConsistency(VersionStorageInfo* vstorage); void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, int level); void Apply(VersionEdit* edit); void SaveTo(VersionStorageInfo* vstorage); - void LoadTableHandlers(InternalStats* internal_stats, int max_threads = 1); + void LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache); void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); private: diff --git a/external/rocksdb/db/version_builder_test.cc b/external/rocksdb/db/version_builder_test.cc index 66230eef40..2a87dc2380 100644 --- a/external/rocksdb/db/version_builder_test.cc +++ b/external/rocksdb/db/version_builder_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/version_edit.cc b/external/rocksdb/db/version_edit.cc index 0c9efe4198..a030b67934 100644 --- a/external/rocksdb/db/version_edit.cc +++ b/external/rocksdb/db/version_edit.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -12,6 +12,7 @@ #include "db/version_set.h" #include "util/coding.h" #include "util/event_logger.h" +#include "util/sync_point.h" #include "rocksdb/slice.h" namespace rocksdb { @@ -32,12 +33,22 @@ enum Tag { // these are new formats divergent from open source leveldb kNewFile2 = 100, kNewFile3 = 102, + kNewFile4 = 103, // 4th (the latest) format version of adding files kColumnFamily = 200, // specify column family for version edit kColumnFamilyAdd = 201, kColumnFamilyDrop = 202, kMaxColumnFamily = 203, }; +enum CustomTag { + kTerminate = 1, // The end of customized fields + kNeedCompaction = 2, + kPathId = 65, +}; +// If this bit for the custom tag is set, opening DB should fail if +// we don't know this field. +uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6; + uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { assert(number <= kFileNumberMask); return number | (path_id * (kFileNumberMask + 1)); @@ -71,30 +82,24 @@ bool VersionEdit::EncodeTo(std::string* dst) const { PutLengthPrefixedSlice(dst, comparator_); } if (has_log_number_) { - PutVarint32(dst, kLogNumber); - PutVarint64(dst, log_number_); + PutVarint32Varint64(dst, kLogNumber, log_number_); } if (has_prev_log_number_) { - PutVarint32(dst, kPrevLogNumber); - PutVarint64(dst, prev_log_number_); + PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_); } if (has_next_file_number_) { - PutVarint32(dst, kNextFileNumber); - PutVarint64(dst, next_file_number_); + PutVarint32Varint64(dst, kNextFileNumber, next_file_number_); } if (has_last_sequence_) { - PutVarint32(dst, kLastSequence); - PutVarint64(dst, last_sequence_); + PutVarint32Varint64(dst, kLastSequence, last_sequence_); } if (has_max_column_family_) { - PutVarint32(dst, kMaxColumnFamily); - PutVarint32(dst, max_column_family_); + PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_); } for (const auto& deleted : deleted_files_) { - PutVarint32(dst, kDeletedFile); - PutVarint32(dst, deleted.first /* level */); - 
PutVarint64(dst, deleted.second /* file number */); + PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */, + deleted.second /* file number */); } for (size_t i = 0; i < new_files_.size(); i++) { @@ -102,29 +107,73 @@ bool VersionEdit::EncodeTo(std::string* dst) const { if (!f.smallest.Valid() || !f.largest.Valid()) { return false; } - if (f.fd.GetPathId() == 0) { + bool has_customized_fields = false; + if (f.marked_for_compaction) { + PutVarint32(dst, kNewFile4); + has_customized_fields = true; + } else if (f.fd.GetPathId() == 0) { // Use older format to make sure user can roll back the build if they // don't config multiple DB paths. PutVarint32(dst, kNewFile2); } else { PutVarint32(dst, kNewFile3); } - PutVarint32(dst, new_files_[i].first); // level - PutVarint64(dst, f.fd.GetNumber()); - if (f.fd.GetPathId() != 0) { + PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber()); + if (f.fd.GetPathId() != 0 && !has_customized_fields) { + // kNewFile3 PutVarint32(dst, f.fd.GetPathId()); } PutVarint64(dst, f.fd.GetFileSize()); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); - PutVarint64(dst, f.smallest_seqno); - PutVarint64(dst, f.largest_seqno); + PutVarint64Varint64(dst, f.smallest_seqno, f.largest_seqno); + if (has_customized_fields) { + // Customized fields' format: + // +-----------------------------+ + // | 1st field's tag (varint32) | + // +-----------------------------+ + // | 1st field's size (varint32) | + // +-----------------------------+ + // | bytes for 1st field | + // | (based on size decoded) | + // +-----------------------------+ + // | | + // | ...... 
| + // | | + // +-----------------------------+ + // | last field's size (varint32)| + // +-----------------------------+ + // | bytes for last field | + // | (based on size decoded) | + // +-----------------------------+ + // | terminating tag (varint32) | + // +-----------------------------+ + // + // Customized encoding for fields: + // tag kPathId: 1 byte as path_id + // tag kNeedCompaction: + // now only can take one char value 1 indicating need-compaction + // + if (f.fd.GetPathId() != 0) { + PutVarint32(dst, CustomTag::kPathId); + char p = static_cast(f.fd.GetPathId()); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (f.marked_for_compaction) { + PutVarint32(dst, CustomTag::kNeedCompaction); + char p = static_cast(1); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", + dst); + + PutVarint32(dst, CustomTag::kTerminate); + } } // 0 is default and does not need to be explicitly written if (column_family_ != 0) { - PutVarint32(dst, kColumnFamily); - PutVarint32(dst, column_family_); + PutVarint32Varint32(dst, kColumnFamily, column_family_); } if (is_column_family_add_) { @@ -161,6 +210,63 @@ bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { } } +const char* VersionEdit::DecodeNewFile4From(Slice* input) { + const char* msg = nullptr; + int level; + FileMetaData f; + uint64_t number; + uint32_t path_id = 0; + uint64_t file_size; + if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && + GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && + GetInternalKey(input, &f.largest) && + GetVarint64(input, &f.smallest_seqno) && + GetVarint64(input, &f.largest_seqno)) { + // See comments in VersionEdit::EncodeTo() for format of customized fields + while (true) { + uint32_t custom_tag; + Slice field; + if (!GetVarint32(input, &custom_tag)) { + return "new-file4 custom field"; + } + if (custom_tag == kTerminate) { + break; + } + if 
(!GetLengthPrefixedSlice(input, &field)) { + return "new-file4 custom field lenth prefixed slice error"; + } + switch (custom_tag) { + case kPathId: + if (field.size() != 1) { + return "path_id field wrong size"; + } + path_id = field[0]; + if (path_id > 3) { + return "path_id wrong vaue"; + } + break; + case kNeedCompaction: + if (field.size() != 1) { + return "need_compaction field wrong size"; + } + f.marked_for_compaction = (field[0] == 1); + break; + default: + if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { + // Should not proceed if cannot understand it + return "new-file4 custom field not supported"; + } + break; + } + } + } else { + return "new-file4 entry"; + } + f.fd = FileDescriptor(number, path_id, file_size); + new_files_.push_back(std::make_pair(level, f)); + return nullptr; +} + Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); Slice input = src; @@ -304,6 +410,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } + case kNewFile4: { + msg = DecodeNewFile4From(&input); + break; + } + case kColumnFamily: if (!GetVarint32(&input, &column_family_)) { if (!msg) { diff --git a/external/rocksdb/db/version_edit.h b/external/rocksdb/db/version_edit.h index 5c558409aa..6bd5b23a1a 100644 --- a/external/rocksdb/db/version_edit.h +++ b/external/rocksdb/db/version_edit.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -55,7 +55,8 @@ struct FileDescriptor { return packed_number_and_path_id & kFileNumberMask; } uint32_t GetPathId() const { - return packed_number_and_path_id / (kFileNumberMask + 1); + return static_cast( + packed_number_and_path_id / (kFileNumberMask + 1)); } uint64_t GetFileSize() const { return file_size; } }; @@ -192,7 +193,7 @@ class VersionEdit { f.smallest_seqno = smallest_seqno; f.largest_seqno = largest_seqno; f.marked_for_compaction = marked_for_compaction; - new_files_.emplace_back(level, f); + new_files_.emplace_back(level, std::move(f)); } void AddFile(int level, const FileMetaData& f) { @@ -237,6 +238,8 @@ class VersionEdit { bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); + const char* DecodeNewFile4From(Slice* input); + typedef std::set> DeletedFileSet; const DeletedFileSet& GetDeletedFiles() { return deleted_files_; } diff --git a/external/rocksdb/db/version_edit_test.cc b/external/rocksdb/db/version_edit_test.cc index 4186e08e6e..ab109be600 100644 --- a/external/rocksdb/db/version_edit_test.cc +++ b/external/rocksdb/db/version_edit_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_edit.h" +#include "util/sync_point.h" #include "util/testharness.h" namespace rocksdb { @@ -45,6 +46,121 @@ TEST_F(VersionEditTest, EncodeDecode) { TestEncodeDecode(edit); } +TEST_F(VersionEditTest, EncodeDecodeNewFile4) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true); + edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), + InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, + kBig + 601, false); + edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), + InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, + kBig + 602, true); + + edit.DeleteFile(4, 700); + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); + + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + auto& new_files = parsed.GetNewFiles(); + ASSERT_TRUE(new_files[0].second.marked_for_compaction); + ASSERT_TRUE(!new_files[1].second.marked_for_compaction); + ASSERT_TRUE(new_files[2].second.marked_for_compaction); + ASSERT_EQ(3, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(0, new_files[2].second.fd.GetPathId()); +} + +TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { + static const uint64_t kBig = 1ull << 50; + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true); + edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), + InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, + kBig + 601, false); + edit.DeleteFile(4, 700); + + 
edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + + std::string encoded; + + // Call back function to add extra customized builds. + bool first = true; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) { + std::string* str = reinterpret_cast(arg); + PutVarint32(str, 33); + const std::string str1 = "random_string"; + PutLengthPrefixedSlice(str, str1); + if (first) { + first = false; + PutVarint32(str, 22); + const std::string str2 = "s"; + PutLengthPrefixedSlice(str, str2); + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + edit.EncodeTo(&encoded); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_TRUE(!first); + auto& new_files = parsed.GetNewFiles(); + ASSERT_TRUE(new_files[0].second.marked_for_compaction); + ASSERT_TRUE(!new_files[1].second.marked_for_compaction); + ASSERT_EQ(3, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(1u, parsed.GetDeletedFiles().size()); +} + +TEST_F(VersionEditTest, NewFile4NotSupportedField) { + static const uint64_t kBig = 1ull << 50; + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true); + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + + std::string encoded; + + // Call back function to add extra customized builds. 
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) { + std::string* str = reinterpret_cast(arg); + const std::string str1 = "s"; + PutLengthPrefixedSlice(str, str1); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + edit.EncodeTo(&encoded); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_NOK(s); +} + TEST_F(VersionEditTest, EncodeEmptyFile) { VersionEdit edit; edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false); diff --git a/external/rocksdb/db/version_set.cc b/external/rocksdb/db/version_set.cc index 91471c49d1..8bd6890ac2 100644 --- a/external/rocksdb/db/version_set.cc +++ b/external/rocksdb/db/version_set.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -23,28 +23,32 @@ #include #include +#include "db/compaction.h" #include "db/filename.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" #include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" #include "db/table_cache.h" -#include "db/compaction.h" #include "db/version_builder.h" -#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" -#include "table/table_reader.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" +#include "rocksdb/write_buffer_manager.h" #include "table/format.h" -#include "table/plain_table_factory.h" -#include "table/meta_blocks.h" #include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/merger.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/table_reader.h" +#include "table/two_level_iterator.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/logging.h" +#include "util/perf_context_imp.h" #include "util/stop_watch.h" #include "util/sync_point.h" @@ -83,24 +87,22 @@ int FindFileInRange(const InternalKeyComparator& icmp, // are MergeInProgress). 
class FilePicker { public: - FilePicker( - std::vector* files, - const Slice& user_key, - const Slice& ikey, - autovector* file_levels, - unsigned int num_levels, - FileIndexer* file_indexer, - const Comparator* user_comparator, - const InternalKeyComparator* internal_comparator) + FilePicker(std::vector* files, const Slice& user_key, + const Slice& ikey, autovector* file_levels, + unsigned int num_levels, FileIndexer* file_indexer, + const Comparator* user_comparator, + const InternalKeyComparator* internal_comparator) : num_levels_(num_levels), - curr_level_(-1), - hit_file_level_(-1), + curr_level_(static_cast(-1)), + returned_file_level_(static_cast(-1)), + hit_file_level_(static_cast(-1)), search_left_bound_(0), search_right_bound_(FileIndexer::kLevelMaxIndex), #ifndef NDEBUG files_(files), #endif level_files_brief_(file_levels), + is_hit_file_last_in_level_(false), user_key_(user_key), ikey_(ikey), file_indexer_(file_indexer), @@ -119,12 +121,16 @@ class FilePicker { } } + int GetCurrentLevel() { return returned_file_level_; } + FdWithKeyRange* GetNextFile() { while (!search_ended_) { // Loops over different levels. while (curr_index_in_curr_level_ < curr_file_level_->num_files) { // Loops over all files in current level. FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; hit_file_level_ = curr_level_; + is_hit_file_last_in_level_ = + curr_index_in_curr_level_ == curr_file_level_->num_files - 1; int cmp_largest = -1; // Do key range filtering of files or/and fractional cascading if: @@ -189,6 +195,7 @@ class FilePicker { } prev_file_ = f; #endif + returned_file_level_ = curr_level_; if (curr_level_ > 0 && cmp_largest < 0) { // No more files to search in this level. 
search_ended_ = !PrepareNextLevel(); @@ -208,9 +215,14 @@ class FilePicker { // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts unsigned int GetHitFileLevel() { return hit_file_level_; } + // Returns true if the most recent "hit file" (i.e., one returned by + // GetNextFile()) is at the last index in its level. + bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } + private: unsigned int num_levels_; unsigned int curr_level_; + unsigned int returned_file_level_; unsigned int hit_file_level_; int32_t search_left_bound_; int32_t search_right_bound_; @@ -219,6 +231,7 @@ class FilePicker { #endif autovector* level_files_brief_; bool search_ended_; + bool is_hit_file_last_in_level_; LevelFilesBrief* curr_file_level_; unsigned int curr_index_in_curr_level_; unsigned int start_index_in_curr_level_; @@ -316,7 +329,7 @@ Version::~Version() { f->refs--; if (f->refs <= 0) { if (f->table_reader_handle) { - cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); + cfd_->table_cache()->EraseHandle(f->fd, f->table_reader_handle); f->table_reader_handle = nullptr; } vset_->obsolete_files_.push_back(f); @@ -420,7 +433,7 @@ namespace { // is the largest key that occurs in the file, and value() is an // 16-byte value containing the file number and file size, both // encoded using EncodeFixed64. 
-class LevelFileNumIterator : public Iterator { +class LevelFileNumIterator : public InternalIterator { public: LevelFileNumIterator(const InternalKeyComparator& icmp, const LevelFilesBrief* flevel) @@ -474,23 +487,26 @@ class LevelFileNumIterator : public Iterator { class LevelFileIteratorState : public TwoLevelIteratorState { public: + // @param skip_filters Disables loading/accessing the filter block LevelFileIteratorState(TableCache* table_cache, const ReadOptions& read_options, const EnvOptions& env_options, const InternalKeyComparator& icomparator, HistogramImpl* file_read_hist, bool for_compaction, - bool prefix_enabled) + bool prefix_enabled, bool skip_filters, int level) : TwoLevelIteratorState(prefix_enabled), table_cache_(table_cache), read_options_(read_options), env_options_(env_options), icomparator_(icomparator), file_read_hist_(file_read_hist), - for_compaction_(for_compaction) {} + for_compaction_(for_compaction), + skip_filters_(skip_filters), + level_(level) {} - Iterator* NewSecondaryIterator(const Slice& meta_handle) override { + InternalIterator* NewSecondaryIterator(const Slice& meta_handle) override { if (meta_handle.size() != sizeof(FileDescriptor)) { - return NewErrorIterator( + return NewErrorInternalIterator( Status::Corruption("FileReader invoked with unexpected value")); } else { const FileDescriptor* fd = @@ -498,7 +514,7 @@ class LevelFileIteratorState : public TwoLevelIteratorState { return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *fd, nullptr /* don't need reference to table*/, file_read_hist_, - for_compaction_); + for_compaction_, nullptr /* arena */, skip_filters_, level_); } } @@ -513,6 +529,8 @@ class LevelFileIteratorState : public TwoLevelIteratorState { const InternalKeyComparator& icomparator_; HistogramImpl* file_read_hist_; bool for_compaction_; + bool skip_filters_; + int level_; }; // A wrapper of version builder which references the current version in @@ -523,7 +541,7 @@ class 
BaseReferencedVersionBuilder { explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) : version_builder_(new VersionBuilder( cfd->current()->version_set()->env_options(), cfd->table_cache(), - cfd->current()->storage_info())), + cfd->current()->storage_info(), cfd->ioptions()->info_log)), version_(cfd->current()) { version_->Ref(); } @@ -541,7 +559,7 @@ class BaseReferencedVersionBuilder { Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, - const std::string* fname) { + const std::string* fname) const { auto table_cache = cfd_->table_cache(); auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( @@ -580,8 +598,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, new RandomAccessFileReader(std::move(file))); s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), - Footer::kInvalidTableMagicNumber /* table's magic number */, vset_->env_, - ioptions->info_log, &raw_table_properties); + Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, &raw_table_properties); if (!s.ok()) { return s; } @@ -623,6 +640,38 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, return Status::OK(); } +Status Version::GetPropertiesOfTablesInRange( + const Range* range, std::size_t n, TablePropertiesCollection* props) const { + for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { + for (decltype(n) i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + std::vector files; + storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr, + false); + for (const auto& file_meta : files) { + auto fname = + TableFileName(vset_->db_options_->db_paths, + file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); + if (props->count(fname) == 0) { + // 1. 
If the table is already present in table cache, load table + // properties from there. + std::shared_ptr table_properties; + Status s = GetTableProperties(&table_properties, file_meta, &fname); + if (s.ok()) { + props->insert({fname, table_properties}); + } else { + return s; + } + } + } + } + } + + return Status::OK(); +} + Status Version::GetAggregatedTableProperties( std::shared_ptr* tp, int level) { TablePropertiesCollection props; @@ -705,29 +754,47 @@ uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // (2) keys are directly overwritten // (3) deletion on non-existing keys // (4) low number of samples - if (num_samples_ == 0) { + if (current_num_samples_ == 0) { return 0; } - if (accumulated_num_non_deletions_ <= accumulated_num_deletions_) { + if (current_num_non_deletions_ <= current_num_deletions_) { return 0; } - uint64_t est = accumulated_num_non_deletions_ - accumulated_num_deletions_; + uint64_t est = current_num_non_deletions_ - current_num_deletions_; uint64_t file_count = 0; for (int level = 0; level < num_levels_; ++level) { file_count += files_[level].size(); } - if (num_samples_ < file_count) { + if (current_num_samples_ < file_count) { // casting to avoid overflowing - return (est * static_cast(file_count) / num_samples_); + return + static_cast( + (est * static_cast(file_count) / current_num_samples_) + ); } else { return est; } } +double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel( + int level) const { + assert(level < num_levels_); + uint64_t sum_file_size_bytes = 0; + uint64_t sum_data_size_bytes = 0; + for (auto* file_meta : files_[level]) { + sum_file_size_bytes += file_meta->fd.GetFileSize(); + sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size; + } + if (sum_file_size_bytes == 0) { + return -1.0; + } + return static_cast(sum_data_size_bytes) / sum_file_size_bytes; +} + void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* 
merge_iter_builder) { @@ -745,7 +812,8 @@ void Version::AddIterators(const ReadOptions& read_options, const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, arena)); + cfd_->internal_stats()->GetFileReadHist(0), false, arena, + false /* skip_filters */, 0 /* level */)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -759,7 +827,8 @@ void Version::AddIterators(const ReadOptions& read_options, cfd_->internal_comparator(), cfd_->internal_stats()->GetFileReadHist(level), false /* for_compaction */, - cfd_->ioptions()->prefix_extractor != nullptr); + cfd_->ioptions()->prefix_extractor != nullptr, + IsFilterSkipped(level), level); mem = arena->AllocateAligned(sizeof(LevelFileNumIterator)); auto* first_level_iter = new (mem) LevelFileNumIterator( cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level)); @@ -793,7 +862,9 @@ VersionStorageInfo::VersionStorageInfo( accumulated_raw_value_size_(0), accumulated_num_non_deletions_(0), accumulated_num_deletions_(0), - num_samples_(0), + current_num_non_deletions_(0), + current_num_deletions_(0), + current_num_samples_(0), estimated_compaction_needed_bytes_(0), finalized_(false) { if (ref_vstorage != nullptr) { @@ -803,7 +874,9 @@ VersionStorageInfo::VersionStorageInfo( accumulated_num_non_deletions_ = ref_vstorage->accumulated_num_non_deletions_; accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_; - num_samples_ = ref_vstorage->num_samples_; + current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_; + current_num_deletions_ = ref_vstorage->current_num_deletions_; + current_num_samples_ = ref_vstorage->current_num_samples_; } } @@ -831,21 +904,31 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, refs_(0), version_number_(version_number) 
{} -void Version::Get(const ReadOptions& read_options, - const LookupKey& k, - std::string* value, - Status* status, - MergeContext* merge_context, - bool* value_found) { +void Version::Get(const ReadOptions& read_options, const LookupKey& k, + std::string* value, Status* status, + MergeContext* merge_context, bool* value_found, + bool* key_exists, SequenceNumber* seq) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); assert(status->ok() || status->IsMergeInProgress()); + if (key_exists != nullptr) { + // will falsify below if not found + *key_exists = true; + } + + PinnedIteratorsManager pinned_iters_mgr; GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - value, value_found, merge_context, this->env_); + value, value_found, merge_context, this->env_, seq, + merge_operator_ ? &pinned_iters_mgr : nullptr); + + // Pin blocks that we read to hold merge operands + if (merge_operator_) { + pinned_iters_mgr.StartPinning(); + } FilePicker fp( storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_, @@ -855,7 +938,10 @@ void Version::Get(const ReadOptions& read_options, while (f != nullptr) { *status = table_cache_->Get( read_options, *internal_comparator(), f->fd, ikey, &get_context, - cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel())); + cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), + IsFilterSkipped(static_cast(fp.GetHitFileLevel()), + fp.IsHitFileLastInLevel()), + fp.GetCurrentLevel()); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; @@ -895,20 +981,25 @@ void Version::Get(const ReadOptions& read_options, } // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; - if (merge_operator_->FullMerge(user_key, nullptr, - merge_context->GetOperands(), value, - info_log_)) { - *status = Status::OK(); - } else { - 
RecordTick(db_statistics_, NUMBER_MERGE_FAILURES); - *status = Status::Corruption("could not perform end-of-key merge for ", - user_key); - } + *status = MergeHelper::TimedFullMerge(merge_operator_, user_key, nullptr, + merge_context->GetOperands(), value, + info_log_, db_statistics_, env_); } else { + if (key_exists != nullptr) { + *key_exists = false; + } *status = Status::NotFound(); // Use an empty error message for speed } } +bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) { + // Reaching the bottom level implies misses at all upper levels, so we'll + // skip checking the filters when we predict a hit. + return cfd_->ioptions()->optimize_filters_for_hits && + (level > 0 || is_file_last_in_level) && + level == storage_info_.num_non_empty_levels() - 1; +} + void VersionStorageInfo::GenerateLevelFilesBrief() { level_files_brief_.resize(num_non_empty_levels_); for (int level = 0; level < num_non_empty_levels_; level++) { @@ -960,7 +1051,20 @@ void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) { accumulated_num_non_deletions_ += file_meta->num_entries - file_meta->num_deletions; accumulated_num_deletions_ += file_meta->num_deletions; - num_samples_++; + + current_num_non_deletions_ += + file_meta->num_entries - file_meta->num_deletions; + current_num_deletions_ += file_meta->num_deletions; + current_num_samples_++; +} + +void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) { + if (file_meta->init_stats_from_file) { + current_num_non_deletions_ -= + file_meta->num_entries - file_meta->num_deletions; + current_num_deletions_ -= file_meta->num_deletions; + current_num_samples_--; + } } void Version::UpdateAccumulatedStats(bool update_stats) { @@ -985,6 +1089,12 @@ void Version::UpdateAccumulatedStats(bool update_stats) { if (MaybeInitializeFileMetaData(file_meta)) { // each FileMeta will be initialized only once. 
storage_info_.UpdateAccumulatedStats(file_meta); + // when option "max_open_files" is -1, all the file metadata has + // already been read, so MaybeInitializeFileMetaData() won't incur + // any I/O cost. + if (vset_->db_options_->max_open_files == -1) { + continue; + } if (++init_count >= kMaxInitCount) { break; } @@ -1051,6 +1161,7 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( const MutableCFOptions& mutable_cf_options) { // Only implemented for level-based compaction if (compaction_style_ != kCompactionStyleLevel) { + estimated_compaction_needed_bytes_ = 0; return; } @@ -1064,7 +1175,7 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( // We keep doing it to Level 2, 3, etc, until the last level and return the // accumulated bytes. - size_t bytes_compact_to_next_level = 0; + uint64_t bytes_compact_to_next_level = 0; // Level 0 bool level0_compact_triggered = false; if (static_cast(files_[0].size()) > @@ -1079,10 +1190,23 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( } // Level 1 and up. + uint64_t bytes_next_level = 0; for (int level = base_level(); level <= MaxInputLevel(); level++) { - size_t level_size = 0; - for (auto* f : files_[level]) { - level_size += f->fd.GetFileSize(); + uint64_t level_size = 0; + if (bytes_next_level > 0) { +#ifndef NDEBUG + uint64_t level_size2 = 0; + for (auto* f : files_[level]) { + level_size2 += f->fd.GetFileSize(); + } + assert(level_size2 == bytes_next_level); +#endif + level_size = bytes_next_level; + bytes_next_level = 0; + } else { + for (auto* f : files_[level]) { + level_size += f->fd.GetFileSize(); + } } if (level == base_level() && level0_compact_triggered) { // Add base level size to compaction if level0 compaction triggered. 
@@ -1091,24 +1215,32 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( // Add size added by previous compaction level_size += bytes_compact_to_next_level; bytes_compact_to_next_level = 0; - size_t level_target = MaxBytesForLevel(level); + uint64_t level_target = MaxBytesForLevel(level); if (level_size > level_target) { bytes_compact_to_next_level = level_size - level_target; - // Simplify to assume the actual compaction fan-out ratio is always - // mutable_cf_options.max_bytes_for_level_multiplier. - estimated_compaction_needed_bytes_ += - bytes_compact_to_next_level * - (1 + mutable_cf_options.max_bytes_for_level_multiplier); + // Estimate the actual compaction fan-out ratio as size ratio between + // the two levels. + + assert(bytes_next_level == 0); + if (level + 1 < num_levels_) { + for (auto* f : files_[level + 1]) { + bytes_next_level += f->fd.GetFileSize(); + } + } + if (bytes_next_level > 0) { + assert(level_size > 0); + estimated_compaction_needed_bytes_ += static_cast( + static_cast(bytes_compact_to_next_level) * + (static_cast(bytes_next_level) / + static_cast(level_size) + + 1)); + } } } } void VersionStorageInfo::ComputeCompactionScore( - const MutableCFOptions& mutable_cf_options, - const CompactionOptionsFIFO& compaction_options_fifo) { - double max_score = 0; - int max_score_level = 0; - + const MutableCFOptions& mutable_cf_options) { for (int level = 0; level <= MaxInputLevel(); level++) { double score; if (level == 0) { @@ -1144,7 +1276,7 @@ void VersionStorageInfo::ComputeCompactionScore( if (compaction_style_ == kCompactionStyleFIFO) { score = static_cast(total_size) / - compaction_options_fifo.max_table_files_size; + mutable_cf_options.compaction_options_fifo.max_table_files_size; } else { score = static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger; @@ -1159,19 +1291,11 @@ void VersionStorageInfo::ComputeCompactionScore( } score = static_cast(level_bytes_no_compacting) / MaxBytesForLevel(level); - if 
(max_score < score) { - max_score = score; - max_score_level = level; - } } compaction_level_[level] = level; compaction_score_[level] = score; } - // update the max compaction score in levels 1 to n-1 - max_compaction_score_ = max_score; - max_compaction_score_level_ = max_score_level; - // sort all the levels based on their score. Higher scores get listed // first. Use bubble sort because the number of entries are small. for (int i = 0; i < num_levels() - 2; i++) { @@ -1217,7 +1341,7 @@ namespace { // used to sort files by size struct Fsize { - int index; + size_t index; FileMetaData* file; }; @@ -1229,13 +1353,27 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { } } // anonymous namespace -void VersionStorageInfo::AddFile(int level, FileMetaData* f) { +void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { auto* level_files = &files_[level]; // Must not overlap - assert(level <= 0 || level_files->empty() || - internal_comparator_->Compare( - (*level_files)[level_files->size() - 1]->largest, f->smallest) < - 0); +#ifndef NDEBUG + if (level > 0 && !level_files->empty() && + internal_comparator_->Compare( + (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) { + auto* f2 = (*level_files)[level_files->size() - 1]; + if (info_log != nullptr) { + Error(info_log, "Adding new file %" PRIu64 + " range (%s, %s) to level %d but overlapping " + "with existing file %" PRIu64 " %s %s", + f->fd.GetNumber(), f->smallest.DebugString(true).c_str(), + f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(), + f2->smallest.DebugString(true).c_str(), + f2->largest.DebugString(true).c_str()); + LogFlush(info_log); + } + assert(false); + } +#endif f->refs++; level_files->push_back(f); } @@ -1296,6 +1434,47 @@ void VersionStorageInfo::UpdateNumNonEmptyLevels() { } } +namespace { +// Sort `temp` based on ratio of overlapping size over file size +void SortFileByOverlappingRatio( + const 
InternalKeyComparator& icmp, const std::vector& files, + const std::vector& next_level_files, + std::vector* temp) { + std::unordered_map file_to_order; + auto next_level_it = next_level_files.begin(); + + for (auto& file : files) { + uint64_t overlapping_bytes = 0; + // Skip files in next level that is smaller than current file + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->largest, file->smallest) < 0) { + next_level_it++; + } + + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->smallest, file->largest) < 0) { + overlapping_bytes += (*next_level_it)->fd.file_size; + + if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) { + // next level file cross large boundary of current file. + break; + } + next_level_it++; + } + + assert(file->fd.file_size != 0); + file_to_order[file->fd.GetNumber()] = + overlapping_bytes * 1024u / file->fd.file_size; + } + + std::sort(temp->begin(), temp->end(), + [&](const Fsize& f1, const Fsize& f2) -> bool { + return file_to_order[f1.file->fd.GetNumber()] < + file_to_order[f2.file->fd.GetNumber()]; + }); +} +} // namespace + void VersionStorageInfo::UpdateFilesByCompactionPri( const MutableCFOptions& mutable_cf_options) { if (compaction_style_ == kCompactionStyleFIFO || @@ -1311,7 +1490,7 @@ void VersionStorageInfo::UpdateFilesByCompactionPri( // populate a temp vector for sorting based on size std::vector temp(files.size()); - for (unsigned int i = 0; i < files.size(); i++) { + for (size_t i = 0; i < files.size(); i++) { temp[i].index = i; temp[i].file = files[i]; } @@ -1322,24 +1501,34 @@ void VersionStorageInfo::UpdateFilesByCompactionPri( num = temp.size(); } switch (mutable_cf_options.compaction_pri) { - case kCompactionPriByCompensatedSize: + case kByCompensatedSize: std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), CompareCompensatedSizeDescending); break; - case kCompactionPriByLargestSeq: + case kOldestLargestSeqFirst: 
std::sort(temp.begin(), temp.end(), [this](const Fsize& f1, const Fsize& f2) -> bool { return f1.file->largest_seqno < f2.file->largest_seqno; }); break; + case kOldestSmallestSeqFirst: + std::sort(temp.begin(), temp.end(), + [this](const Fsize& f1, const Fsize& f2) -> bool { + return f1.file->smallest_seqno < f2.file->smallest_seqno; + }); + break; + case kMinOverlappingRatio: + SortFileByOverlappingRatio(*internal_comparator_, files_[level], + files_[level + 1], &temp); + break; default: assert(false); } assert(temp.size() == files.size()); // initialize files_by_compaction_pri_ - for (unsigned int i = 0; i < temp.size(); i++) { - files_by_compaction_pri.push_back(temp[i].index); + for (size_t i = 0; i < temp.size(); i++) { + files_by_compaction_pri.push_back(static_cast(temp[i].index)); } next_file_to_compact_by_size_[level] = 0; assert(files_[level].size() == files_by_compaction_pri_[level].size()); @@ -1357,11 +1546,11 @@ void VersionStorageInfo::GenerateLevel0NonOverlapping() { std::vector level0_sorted_file( level_files_brief_[0].files, level_files_brief_[0].files + level_files_brief_[0].num_files); - sort(level0_sorted_file.begin(), level0_sorted_file.end(), - [this](const FdWithKeyRange & f1, const FdWithKeyRange & f2)->bool { - return (internal_comparator_->Compare(f1.smallest_key, f2.smallest_key) < - 0); - }); + std::sort(level0_sorted_file.begin(), level0_sorted_file.end(), + [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool { + return (internal_comparator_->Compare(f1.smallest_key, + f2.smallest_key) < 0); + }); for (size_t i = 1; i < level0_sorted_file.size(); ++i) { FdWithKeyRange& f = level0_sorted_file[i]; @@ -1405,7 +1594,8 @@ bool VersionStorageInfo::OverlapInLevel(int level, // The file_index returns a pointer to any file in an overlapping range. 
void VersionStorageInfo::GetOverlappingInputs( int level, const InternalKey* begin, const InternalKey* end, - std::vector* inputs, int hint_index, int* file_index) { + std::vector* inputs, int hint_index, int* file_index, + bool expand_range) const { if (level >= num_non_empty_levels_) { // this level is empty, no overlapping inputs return; @@ -1438,7 +1628,7 @@ void VersionStorageInfo::GetOverlappingInputs( // "f" is completely after specified range; skip it } else { inputs->push_back(files_[level][i-1]); - if (level == 0) { + if (level == 0 && expand_range) { // Level-0 files may overlap each other. So check if the newly // added file has expanded the range. If so, restart search. if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { @@ -1464,7 +1654,7 @@ void VersionStorageInfo::GetOverlappingInputs( // forwards to find all overlapping files. void VersionStorageInfo::GetOverlappingInputsBinarySearch( int level, const Slice& user_begin, const Slice& user_end, - std::vector* inputs, int hint_index, int* file_index) { + std::vector* inputs, int hint_index, int* file_index) const { assert(level > 0); int min = 0; int mid = 0; @@ -1512,8 +1702,7 @@ void VersionStorageInfo::GetOverlappingInputsBinarySearch( // Use FileLevel in searching, make it faster void VersionStorageInfo::ExtendOverlappingInputs( int level, const Slice& user_begin, const Slice& user_end, - std::vector* inputs, unsigned int midIndex) { - + std::vector* inputs, unsigned int midIndex) const { const Comparator* user_cmp = user_comparator_; const FdWithKeyRange* files = level_files_brief_[level].files; #ifndef NDEBUG @@ -1895,20 +2084,20 @@ struct VersionSet::ManifestWriter { bool done; InstrumentedCondVar cv; ColumnFamilyData* cfd; - VersionEdit* edit; + const autovector& edit_list; explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, - VersionEdit* e) - : done(false), cv(mu), cfd(_cfd), edit(e) {} + const autovector& e) + : done(false), cv(mu), cfd(_cfd), 
edit_list(e) {} }; VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& storage_options, Cache* table_cache, - WriteBuffer* write_buffer, + WriteBufferManager* write_buffer_manager, WriteController* write_controller) - : column_family_set_(new ColumnFamilySet( - dbname, db_options, storage_options, table_cache, - write_buffer, write_controller)), + : column_family_set_( + new ColumnFamilySet(dbname, db_options, storage_options, table_cache, + write_buffer_manager, write_controller)), env_(db_options->env), dbname_(dbname), db_options_(db_options), @@ -1922,9 +2111,16 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, env_options_(storage_options), env_options_compactions_(env_options_) {} +void CloseTables(void* ptr, size_t) { + TableReader* table_reader = reinterpret_cast(ptr); + table_reader->Close(); +} + VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet + column_family_set_->get_table_cache()->ApplyToAllCacheEntries(&CloseTables, + false); column_family_set_.reset(); for (auto file : obsolete_files_) { delete file; @@ -1936,8 +2132,7 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Version* v) { // compute new compaction score v->storage_info()->ComputeCompactionScore( - *column_family_data->GetLatestMutableCFOptions(), - column_family_data->ioptions()->compaction_options_fifo); + *column_family_data->GetLatestMutableCFOptions()); // Mark v finalized v->storage_info_.SetFinalized(); @@ -1962,20 +2157,34 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, - VersionEdit* edit, InstrumentedMutex* mu, - Directory* db_directory, bool new_descriptor_log, + const autovector& edit_list, + InstrumentedMutex* mu, Directory* db_directory, + bool new_descriptor_log, const 
ColumnFamilyOptions* new_cf_options) { mu->AssertHeld(); + // num of edits + auto num_edits = edit_list.size(); + if (num_edits == 0) { + return Status::OK(); + } else if (num_edits > 1) { +#ifndef NDEBUG + // no group commits for column family add or drop + for (auto& edit : edit_list) { + assert(!edit->IsColumnFamilyManipulation()); + } +#endif + } // column_family_data can be nullptr only if this is column_family_add. // in that case, we also need to specify ColumnFamilyOptions if (column_family_data == nullptr) { - assert(edit->is_column_family_add_); + assert(num_edits == 1); + assert(edit_list[0]->is_column_family_add_); assert(new_cf_options != nullptr); } // queue our request - ManifestWriter w(mu, column_family_data, edit); + ManifestWriter w(mu, column_family_data, edit_list); manifest_writers_.push_back(&w); while (!w.done && &w != manifest_writers_.front()) { w.cv.Wait(); @@ -1995,7 +2204,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, return Status::ShutdownInProgress(); } - std::vector batch_edits; + autovector batch_edits; Version* v = nullptr; std::unique_ptr builder_guard(nullptr); @@ -2003,24 +2212,26 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, ManifestWriter* last_writer = &w; assert(!manifest_writers_.empty()); assert(manifest_writers_.front() == &w); - if (edit->IsColumnFamilyManipulation()) { + if (w.edit_list.front()->IsColumnFamilyManipulation()) { // no group commits for column family add or drop - LogAndApplyCFHelper(edit); - batch_edits.push_back(edit); + LogAndApplyCFHelper(w.edit_list.front()); + batch_edits.push_back(w.edit_list.front()); } else { v = new Version(column_family_data, this, current_version_number_++); builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data)); auto* builder = builder_guard->version_builder(); for (const auto& writer : manifest_writers_) { - if (writer->edit->IsColumnFamilyManipulation() || + if 
(writer->edit_list.front()->IsColumnFamilyManipulation() || writer->cfd->GetID() != column_family_data->GetID()) { // no group commits for column family add or drop // also, group commits across column families are not supported break; } last_writer = writer; - LogAndApplyHelper(column_family_data, builder, v, last_writer->edit, mu); - batch_edits.push_back(last_writer->edit); + for (const auto& edit : writer->edit_list) { + LogAndApplyHelper(column_family_data, builder, v, edit, mu); + batch_edits.push_back(edit); + } } builder->SaveTo(v->storage_info()); } @@ -2043,7 +2254,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (new_descriptor_log) { // if we're writing out new snapshot make sure to persist max column family if (column_family_set_->GetMaxColumnFamily() > 0) { - edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); + w.edit_list.front()->SetMaxColumnFamily( + column_family_set_->GetMaxColumnFamily()); } } @@ -2054,12 +2266,14 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, mu->Unlock(); TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); - if (!edit->IsColumnFamilyManipulation() && + if (!w.edit_list.front()->IsColumnFamilyManipulation() && db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. 
builder_guard->version_builder()->LoadTableHandlers( - column_family_data->internal_stats()); + column_family_data->internal_stats(), + column_family_data->ioptions()->optimize_filters_for_hits, + true /* prefetch_index_and_filter_in_cache */); } // This is fine because everything inside of this block is serialized -- @@ -2070,8 +2284,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_); - s = env_->NewWritableFile( - DescriptorFileName(dbname_, pending_manifest_file_number_), + s = NewWritableFile( + env_, DescriptorFileName(dbname_, pending_manifest_file_number_), &descriptor_file, opt_env_opts); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( @@ -2079,12 +2293,13 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, unique_ptr file_writer( new WritableFileWriter(std::move(descriptor_file), opt_env_opts)); - descriptor_log_.reset(new log::Writer(std::move(file_writer))); + descriptor_log_.reset( + new log::Writer(std::move(file_writer), 0, false)); s = WriteSnapshot(descriptor_log_.get()); } } - if (!edit->IsColumnFamilyManipulation()) { + if (!w.edit_list.front()->IsColumnFamilyManipulation()) { // This is cpu-heavy operations, which should be called outside mutex. 
v->PrepareApply(mutable_cf_options, true); } @@ -2098,6 +2313,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, "Unable to Encode VersionEdit:" + e->DebugString(true)); break; } + TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", + rocksdb_kill_odds * REDUCE_ODDS2); s = descriptor_log_->AddRecord(record); if (!s.ok()) { break; @@ -2109,27 +2326,6 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); - bool all_records_in = true; - for (auto& e : batch_edits) { - std::string record; - if (!e->EncodeTo(&record)) { - s = Status::Corruption( - "Unable to Encode VersionEdit:" + e->DebugString(true)); - all_records_in = false; - break; - } - if (!ManifestContains(pending_manifest_file_number_, record)) { - all_records_in = false; - break; - } - } - if (all_records_in) { - Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log, - "MANIFEST contains log record despite error; advancing to new " - "version to prevent mismatch between in-memory and logged state" - " If paranoid is set, then the db is now in readonly mode."); - s = Status::OK(); - } } } @@ -2138,15 +2334,6 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, db_options_->disableDataSync ? 
nullptr : db_directory); - if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { - // delete old manifest file - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", - manifest_file_number_, pending_manifest_file_number_); - // we don't care about an error here, PurgeObsoleteFiles will take care - // of it later - env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_)); - } } if (s.ok()) { @@ -2154,23 +2341,32 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, new_manifest_file_size = descriptor_log_->file()->GetFileSize(); } - if (edit->is_column_family_drop_) { + if (w.edit_list.front()->is_column_family_drop_) { + TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2"); } LogFlush(db_options_->info_log); + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone"); mu->Lock(); } + // Append the old mainfest file to the obsolete_manifests_ list to be deleted + // by PurgeObsoleteFiles later. 
+ if (s.ok() && new_descriptor_log) { + obsolete_manifests_.emplace_back( + DescriptorFileName("", manifest_file_number_)); + } + // Install the new version if (s.ok()) { - if (edit->is_column_family_add_) { + if (w.edit_list.front()->is_column_family_add_) { // no group commit on column family add assert(batch_edits.size() == 1); assert(new_cf_options != nullptr); - CreateColumnFamily(*new_cf_options, edit); - } else if (edit->is_column_family_drop_) { + CreateColumnFamily(*new_cf_options, w.edit_list.front()); + } else if (w.edit_list.front()->is_column_family_drop_) { assert(batch_edits.size() == 1); column_family_data->SetDropped(); if (column_family_data->Unref()) { @@ -2193,12 +2389,16 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, manifest_file_number_ = pending_manifest_file_number_; manifest_file_size_ = new_manifest_file_size; - prev_log_number_ = edit->prev_log_number_; + prev_log_number_ = w.edit_list.front()->prev_log_number_; } else { + std::string version_edits; + for (auto& e : batch_edits) { + version_edits = version_edits + "\n" + e->DebugString(true); + } Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, - "Error in committing version %lu to [%s]", - (unsigned long)v->GetVersionNumber(), - column_family_data->GetName().c_str()); + "[%s] Error in committing version edit to MANIFEST: %s", + column_family_data ? 
column_family_data->GetName().c_str() : "", + version_edits.c_str()); delete v; if (new_descriptor_log) { Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, @@ -2341,8 +2541,8 @@ Status VersionSet::Recover( { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(manifest_file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/); + log::Reader reader(NULL, std::move(manifest_file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/, 0); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2515,8 +2715,9 @@ Status VersionSet::Recover( if (db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. - builder->LoadTableHandlers(cfd->internal_stats(), - db_options_->max_file_opening_threads); + builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */); } Version* v = new Version(cfd, this, current_version_number_++); @@ -2554,7 +2755,7 @@ Status VersionSet::Recover( } } - for (auto builder : builders) { + for (auto& builder : builders) { delete builder.second; } @@ -2594,8 +2795,8 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(NULL, std::move(file_reader), &reporter, true /*checksum*/, + 0 /*initial_offset*/, 0); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2647,7 +2848,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, std::shared_ptr tc(NewLRUCache(options->max_open_files - 10, options->table_cache_numshardbits)); WriteController wc(options->delayed_write_rate); - WriteBuffer 
wb(options->db_write_buffer_size); + WriteBufferManager wb(options->db_write_buffer_size); VersionSet versions(dbname, options, env_options, tc.get(), &wb, &wc); Status status; @@ -2752,8 +2953,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(NULL, std::move(file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/, 0); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2982,44 +3183,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return Status::OK(); } -// Opens the mainfest file and reads all records -// till it finds the record we are looking for. -bool VersionSet::ManifestContains(uint64_t manifest_file_num, - const std::string& record) const { - std::string fname = DescriptorFileName(dbname_, manifest_file_num); - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: checking %s\n", fname.c_str()); - - unique_ptr file_reader; - Status s; - { - unique_ptr file; - s = env_->NewSequentialFile(fname, &file, env_options_); - if (!s.ok()) { - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: %s\n", s.ToString().c_str()); - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: is unable to reopen the manifest file %s", - fname.c_str()); - return false; - } - file_reader.reset(new SequentialFileReader(std::move(file))); - } - log::Reader reader(std::move(file_reader), nullptr, true /*checksum*/, 0); - Slice r; - std::string scratch; - bool result = false; - while (reader.ReadRecord(&r, &scratch)) { - if (r == Slice(record)) { - result = true; - break; - } - } - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: result = %d\n", result ? 
1 : 0); - return result; -} - // TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this // function is called repeatedly with consecutive pairs of slices. For example // if the slice list is [a, b, c, d] this function is called with arguments @@ -3119,7 +3282,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, // "key" falls in the range for this table. Add the // approximate offset of "key" within the table. TableReader* table_reader_ptr; - Iterator* iter = v->cfd_->table_cache()->NewIterator( + InternalIterator* iter = v->cfd_->table_cache()->NewIterator( ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd, &table_reader_ptr); if (table_reader_ptr != nullptr) { @@ -3166,7 +3329,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } } -Iterator* VersionSet::MakeInputIterator(Compaction* c) { +InternalIterator* VersionSet::MakeInputIterator(const Compaction* c) { auto cfd = c->column_family_data(); ReadOptions read_options; read_options.verify_checksums = @@ -3182,7 +3345,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { const size_t space = (c->level() == 0 ? 
c->input_levels(0)->num_files + c->num_input_levels() - 1 : c->num_input_levels()); - Iterator** list = new Iterator* [space]; + InternalIterator** list = new InternalIterator* [space]; size_t num = 0; for (size_t which = 0; which < c->num_input_levels(); which++) { if (c->input_levels(which)->num_files != 0) { @@ -3193,7 +3356,8 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { read_options, env_options_compactions_, cfd->internal_comparator(), flevel->files[i].fd, nullptr, nullptr, /* no per level latency histogram*/ - true /* for compaction */); + true /* for_compaction */, nullptr /* arena */, + false /* skip_filters */, (int)which /* level */); } } else { // Create concatenating iterator for the files from this level @@ -3202,14 +3366,15 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { cfd->table_cache(), read_options, env_options_, cfd->internal_comparator(), nullptr /* no per level latency histogram */, - true /* for_compaction */, false /* prefix enabled */), + true /* for_compaction */, false /* prefix enabled */, + false /* skip_filters */, (int)which /* level */), new LevelFileNumIterator(cfd->internal_comparator(), c->input_levels(which))); } } } assert(num <= space); - Iterator* result = + InternalIterator* result = NewMergingIterator(&c->column_family_data()->internal_comparator(), list, static_cast(num)); delete[] list; @@ -3248,7 +3413,7 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { for (size_t i = 0; i < c->num_input_files(input); ++i) { uint64_t number = c->input(input, i)->fd.GetNumber(); bool found = false; - for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { + for (size_t j = 0; j < vstorage->files_[level].size(); j++) { FileMetaData* f = vstorage->files_[level][j]; if (f->fd.GetNumber() == number) { found = true; @@ -3315,7 +3480,10 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { } void VersionSet::GetObsoleteFiles(std::vector* files, + std::vector* 
manifest_filenames, uint64_t min_pending_output) { + assert(manifest_filenames->empty()); + obsolete_manifests_.swap(*manifest_filenames); std::vector pending_files; for (auto f : obsolete_files_) { if (f->fd.GetNumber() < min_pending_output) { diff --git a/external/rocksdb/db/version_set.h b/external/rocksdb/db/version_set.h index 3964600953..8fe7c47ce2 100644 --- a/external/rocksdb/db/version_set.h +++ b/external/rocksdb/db/version_set.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -48,15 +49,14 @@ class Writer; } class Compaction; -class Iterator; +class InternalIterator; class LogBuffer; class LookupKey; class MemTable; class Version; class VersionSet; -class WriteBuffer; +class WriteBufferManager; class MergeContext; -class ColumnFamilyData; class ColumnFamilySet; class TableCache; class MergeIteratorBuilder; @@ -97,7 +97,7 @@ class VersionStorageInfo { void Reserve(int level, size_t size) { files_[level].reserve(size); } - void AddFile(int level, FileMetaData* f); + void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr); void SetFinalized(); @@ -111,15 +111,16 @@ class VersionStorageInfo { // Update the accumulated stats from a file-meta. void UpdateAccumulatedStats(FileMetaData* file_meta); + // Decrease the current stat form a to-be-delected file-meta + void RemoveCurrentStats(FileMetaData* file_meta); + void ComputeCompensatedSizes(); // Updates internal structures that keep track of compaction scores // We use compaction scores to figure out which compaction to do next // REQUIRES: db_mutex held!! 
// TODO find a better way to pass compaction_options_fifo. - void ComputeCompactionScore( - const MutableCFOptions& mutable_cf_options, - const CompactionOptionsFIFO& compaction_options_fifo); + void ComputeCompactionScore(const MutableCFOptions& mutable_cf_options); // Estimate est_comp_needed_bytes_ void EstimateCompactionBytesNeeded( @@ -143,12 +144,6 @@ class VersionStorageInfo { int MaxInputLevel() const; - // Returns the maxmimum compaction score for levels 1 to max - double max_compaction_score() const { return max_compaction_score_; } - - // See field declaration - int max_compaction_score_level() const { return max_compaction_score_level_; } - // Return level number that has idx'th highest score int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } @@ -159,23 +154,26 @@ class VersionStorageInfo { int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys std::vector* inputs, - int hint_index = -1, // index of overlap file - int* file_index = nullptr); // return index of overlap file + int hint_index = -1, // index of overlap file + int* file_index = nullptr, // return index of overlap file + bool expand_range = true) // if set, returns files which overlap the + const; // range and overlap each other. 
If false, + // then just files intersecting the range void GetOverlappingInputsBinarySearch( int level, const Slice& begin, // nullptr means before all keys const Slice& end, // nullptr means after all keys std::vector* inputs, - int hint_index, // index of overlap file - int* file_index); // return index of overlap file + int hint_index, // index of overlap file + int* file_index) const; // return index of overlap file void ExtendOverlappingInputs( int level, const Slice& begin, // nullptr means before all keys const Slice& end, // nullptr means after all keys std::vector* inputs, - unsigned int index); // start extending from this index + unsigned int index) const; // start extending from this index // Returns true iff some file in the specified level overlaps // some part of [*smallest_user_key,*largest_user_key]. @@ -300,6 +298,8 @@ class VersionStorageInfo { uint64_t GetEstimatedActiveKeys() const; + double GetEstimatedCompressionRatioAtLevel(int level) const; + // re-initializes the index that is used to offset into // files_by_compaction_pri_ // to find the next compaction candidate file. @@ -325,6 +325,10 @@ class VersionStorageInfo { return estimated_compaction_needed_bytes_; } + void TEST_set_estimated_compaction_needed_bytes(uint64_t v) { + estimated_compaction_needed_bytes_ = v; + } + private: const InternalKeyComparator* internal_comparator_; const Comparator* user_comparator_; @@ -381,8 +385,6 @@ class VersionStorageInfo { // These are used to pick the best compaction level std::vector compaction_score_; std::vector compaction_level_; - double max_compaction_score_ = 0.0; // max score in l1 to ln-1 - int max_compaction_score_level_ = 0; // level on which max score occurs int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop // for number of L0 files. 
@@ -397,8 +399,12 @@ class VersionStorageInfo { uint64_t accumulated_num_non_deletions_; // total number of deletion entries uint64_t accumulated_num_deletions_; - // the number of samples - uint64_t num_samples_; + // current number of non_deletion entries + uint64_t current_num_non_deletions_; + // current number of delection entries + uint64_t current_num_deletions_; + // current number of file samples + uint64_t current_num_samples_; // Estimated bytes needed to be compacted until all levels' size is down to // target sizes. uint64_t estimated_compaction_needed_bytes_; @@ -422,11 +428,24 @@ class Version { // Lookup the value for key. If found, store it in *val and // return OK. Else return a non-OK status. - // Uses *operands to store merge_operator operations to apply later + // Uses *operands to store merge_operator operations to apply later. + // + // If the ReadOptions.read_tier is set to do a read-only fetch, then + // *value_found will be set to false if it cannot be determined whether + // this value exists without doing IO. + // + // If the key is Deleted, *status will be set to NotFound and + // *key_exists will be set to true. + // If no key was found, *status will be set to NotFound and + // *key_exists will be set to false. + // If seq is non-null, *seq will be set to the sequence number found + // for the key if a key was found. + // // REQUIRES: lock is not held void Get(const ReadOptions&, const LookupKey& key, std::string* val, Status* status, MergeContext* merge_context, - bool* value_found = nullptr); + bool* value_found = nullptr, bool* key_exists = nullptr, + SequenceNumber* seq = nullptr); // Loads some stats information from files. Call without mutex held. It needs // to be called before applying the version to the version set. @@ -456,15 +475,16 @@ class Version { // file-name conversion. 
Status GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, - const std::string* fname = nullptr); + const std::string* fname = nullptr) const; // REQUIRES: lock is held // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the // tables' propertis, represented as shared_ptr. Status GetPropertiesOfAllTables(TablePropertiesCollection* props); - Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); + Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, + TablePropertiesCollection* props) const; // REQUIRES: lock is held // On success, "tp" will contains the aggregated table property amoug @@ -502,9 +522,16 @@ class Version { return storage_info_.user_comparator_; } - bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, + bool PrefixMayMatch(const ReadOptions& read_options, + InternalIterator* level_iter, const Slice& internal_prefix) const; + // Returns true if the filter blocks in the specified level will not be + // checked during read operations. In certain cases (trivial move or preload), + // the filter block may already be cached, but we still do not access it such + // that it eventually expires from the cache. + bool IsFilterSkipped(int level, bool is_file_last_in_level = false); + // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_mata from its associated TableProperties. // Returns true if it does initialize FileMetaData. 
@@ -548,7 +575,8 @@ class VersionSet { public: VersionSet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, - WriteBuffer* write_buffer, WriteController* write_controller); + WriteBufferManager* write_buffer_manager, + WriteController* write_controller); ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -562,6 +590,19 @@ class VersionSet { const MutableCFOptions& mutable_cf_options, VersionEdit* edit, InstrumentedMutex* mu, Directory* db_directory = nullptr, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr) { + autovector edit_list; + edit_list.push_back(edit); + return LogAndApply(column_family_data, mutable_cf_options, edit_list, mu, + db_directory, new_descriptor_log, column_family_options); + } + // The batch version. If edit_list.size() > 1, caller must ensure that + // no edit in the list column family add or drop + Status LogAndApply( + ColumnFamilyData* column_family_data, + const MutableCFOptions& mutable_cf_options, + const autovector& edit_list, InstrumentedMutex* mu, + Directory* db_directory = nullptr, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr); // Recover the last saved descriptor from persistent storage. @@ -599,6 +640,8 @@ class VersionSet { // Return the current manifest file number uint64_t manifest_file_number() const { return manifest_file_number_; } + uint64_t options_file_number() const { return options_file_number_; } + uint64_t pending_manifest_file_number() const { return pending_manifest_file_number_; } @@ -643,7 +686,7 @@ class VersionSet { // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. - Iterator* MakeInputIterator(Compaction* c); + InternalIterator* MakeInputIterator(const Compaction* c); // Add all files listed in any live version to *live. 
void AddLiveFiles(std::vector* live_list); @@ -670,6 +713,7 @@ class VersionSet { void GetLiveFilesMetaData(std::vector *metadata); void GetObsoleteFiles(std::vector* files, + std::vector* manifest_filenames, uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } @@ -704,9 +748,6 @@ class VersionSet { void AppendVersion(ColumnFamilyData* column_family_data, Version* v); - bool ManifestContains(uint64_t manifest_file_number, - const std::string& record) const; - ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); @@ -717,6 +758,7 @@ class VersionSet { const DBOptions* const db_options_; std::atomic next_file_number_; uint64_t manifest_file_number_; + uint64_t options_file_number_; uint64_t pending_manifest_file_number_; std::atomic last_sequence_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted @@ -734,6 +776,7 @@ class VersionSet { uint64_t manifest_file_size_; std::vector obsolete_files_; + std::vector obsolete_manifests_; // env options for all reads and writes except compactions const EnvOptions& env_options_; diff --git a/external/rocksdb/db/version_set_test.cc b/external/rocksdb/db/version_set_test.cc index 6e513828ba..98b20a1109 100644 --- a/external/rocksdb/db/version_set_test.cc +++ b/external/rocksdb/db/version_set_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -23,7 +23,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { GenerateLevelFilesBriefTest() { } ~GenerateLevelFilesBriefTest() { - for (unsigned int i = 0; i < files_.size(); i++) { + for (size_t i = 0; i < files_.size(); i++) { delete files_[i]; } } diff --git a/external/rocksdb/db/wal_manager.cc b/external/rocksdb/db/wal_manager.cc index 37861ab45d..e57c120408 100644 --- a/external/rocksdb/db/wal_manager.cc +++ b/external/rocksdb/db/wal_manager.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -329,8 +329,8 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, return s; } - log_files.push_back(std::move(std::unique_ptr( - new LogFileImpl(number, log_type, sequence, size_bytes)))); + log_files.push_back(std::unique_ptr( + new LogFileImpl(number, log_type, sequence, size_bytes))); } } CompareLogByPointer compare_log_files; @@ -383,7 +383,7 @@ Status WalManager::ReadFirstRecord(const WalFileType type, Status s; if (type == kAliveLogFile) { std::string fname = LogFileName(db_options_.wal_dir, number); - s = ReadFirstLine(fname, sequence); + s = ReadFirstLine(fname, number, sequence); if (env_->FileExists(fname).ok() && !s.ok()) { // return any error that is not caused by non-existing file return s; @@ -394,7 +394,7 @@ Status WalManager::ReadFirstRecord(const WalFileType type, // check if the file got moved to archive. std::string archived_file = ArchivedLogFileName(db_options_.wal_dir, number); - s = ReadFirstLine(archived_file, sequence); + s = ReadFirstLine(archived_file, number, sequence); // maybe the file was deleted from archive dir. If that's the case, return // Status::OK(). 
The caller with identify this as empty file because // *sequence == 0 @@ -413,6 +413,7 @@ Status WalManager::ReadFirstRecord(const WalFileType type, // the function returns status.ok() and sequence == 0 if the file exists, but is // empty Status WalManager::ReadFirstLine(const std::string& fname, + const uint64_t number, SequenceNumber* sequence) { struct LogReporter : public log::Reader::Reporter { Env* env; @@ -448,14 +449,14 @@ Status WalManager::ReadFirstLine(const std::string& fname, reporter.fname = fname.c_str(); reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; - log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/, number); std::string scratch; Slice record; if (reader.ReadRecord(&record, &scratch) && (status.ok() || !db_options_.paranoid_checks)) { - if (record.size() < 12) { + if (record.size() < WriteBatchInternal::kHeader) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); // TODO read record's till the first no corrupt entry? diff --git a/external/rocksdb/db/wal_manager.h b/external/rocksdb/db/wal_manager.h index fc04863b26..d27985b9d6 100644 --- a/external/rocksdb/db/wal_manager.h +++ b/external/rocksdb/db/wal_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -54,9 +54,9 @@ class WalManager { return ReadFirstRecord(type, number, sequence); } - Status TEST_ReadFirstLine(const std::string& fname, + Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number, SequenceNumber* sequence) { - return ReadFirstLine(fname, sequence); + return ReadFirstLine(fname, number, sequence); } private: @@ -71,7 +71,8 @@ class WalManager { Status ReadFirstRecord(const WalFileType type, const uint64_t number, SequenceNumber* sequence); - Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence); + Status ReadFirstLine(const std::string& fname, const uint64_t number, + SequenceNumber* sequence); // ------- state from DBImpl ------ const DBOptions& db_options_; diff --git a/external/rocksdb/db/wal_manager_test.cc b/external/rocksdb/db/wal_manager_test.cc index ec56c9632d..9c7ac50bac 100644 --- a/external/rocksdb/db/wal_manager_test.cc +++ b/external/rocksdb/db/wal_manager_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -10,12 +10,12 @@ #include "rocksdb/cache.h" #include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" #include "db/wal_manager.h" #include "db/log_writer.h" #include "db/column_family.h" #include "db/version_set.h" -#include "db/writebuffer.h" #include "util/file_reader_writer.h" #include "util/mock_env.h" #include "util/string_util.h" @@ -34,7 +34,7 @@ class WalManagerTest : public testing::Test { : env_(new MockEnv(Env::Default())), dbname_(test::TmpDir() + "/wal_manager_test"), table_cache_(NewLRUCache(50000, 16)), - write_buffer_(db_options_.db_write_buffer_size), + write_buffer_manager_(db_options_.db_write_buffer_size), current_log_number_(0) { DestroyDB(dbname_, Options()); } @@ -48,7 +48,7 @@ class WalManagerTest : public testing::Test { db_options_.env = env_.get(); versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_, + table_cache_.get(), &write_buffer_manager_, &write_controller_)); wal_manager_.reset(new WalManager(db_options_, env_options_)); @@ -77,7 +77,7 @@ class WalManagerTest : public testing::Test { ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options_)); - current_log_writer_.reset(new log::Writer(std::move(file_writer))); + current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); } void CreateArchiveLogs(int num_logs, int entries_per_log) { @@ -95,7 +95,7 @@ class WalManagerTest : public testing::Test { Status status = wal_manager_->GetUpdatesSince( seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get()); EXPECT_OK(status); - return std::move(iter); + return iter; } std::unique_ptr env_; @@ -104,7 +104,7 @@ class WalManagerTest : public testing::Test { EnvOptions env_options_; std::shared_ptr table_cache_; DBOptions db_options_; - WriteBuffer write_buffer_; + WriteBufferManager write_buffer_manager_; std::unique_ptr versions_; std::unique_ptr wal_manager_; 
@@ -119,15 +119,17 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); SequenceNumber s; - ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, &s)); + ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s)); ASSERT_EQ(s, 0U); - ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_OK( + wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s)); ASSERT_EQ(s, 0U); unique_ptr file_writer( new WritableFileWriter(std::move(file), EnvOptions())); - log::Writer writer(std::move(file_writer)); + log::Writer writer(std::move(file_writer), 1, + db_options_.recycle_log_file_num > 0); WriteBatch batch; batch.Put("foo", "bar"); WriteBatchInternal::SetSequence(&batch, 10); @@ -183,7 +185,7 @@ std::vector ListSpecificFiles( } } } - return std::move(file_numbers); + return file_numbers; } int CountRecords(TransactionLogIterator* iter) { diff --git a/external/rocksdb/db/write_batch.cc b/external/rocksdb/db/write_batch.cc index 53431b92a0..61c2eb6f44 100644 --- a/external/rocksdb/db/write_batch.cc +++ b/external/rocksdb/db/write_batch.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -20,19 +20,28 @@ // kTypeColumnFamilyDeletion varint32 varstring varstring // kTypeColumnFamilySingleDeletion varint32 varstring varstring // kTypeColumnFamilyMerge varint32 varstring varstring +// kTypeBeginPrepareXID varstring +// kTypeEndPrepareXID +// kTypeCommitXID varstring +// kTypeRollbackXID varstring +// kTypeNoop // varstring := // len: varint32 // data: uint8[len] #include "rocksdb/write_batch.h" +#include #include #include +#include #include "db/column_family.h" #include "db/db_impl.h" #include "db/dbformat.h" +#include "db/flush_scheduler.h" #include "db/memtable.h" +#include "db/merge_context.h" #include "db/snapshot_impl.h" #include "db/write_batch_internal.h" #include "rocksdb/merge_operator.h" @@ -42,30 +51,118 @@ namespace rocksdb { -// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. -static const size_t kHeader = 12; +// anon namespace for file-local types +namespace { + +enum ContentFlags : uint32_t { + DEFERRED = 1 << 0, + HAS_PUT = 1 << 1, + HAS_DELETE = 1 << 2, + HAS_SINGLE_DELETE = 1 << 3, + HAS_MERGE = 1 << 4, + HAS_BEGIN_PREPARE = 1 << 5, + HAS_END_PREPARE = 1 << 6, + HAS_COMMIT = 1 << 7, + HAS_ROLLBACK = 1 << 8, +}; + +struct BatchContentClassifier : public WriteBatch::Handler { + uint32_t content_flags = 0; + + Status PutCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_PUT; + return Status::OK(); + } + + Status DeleteCF(uint32_t, const Slice&) override { + content_flags |= ContentFlags::HAS_DELETE; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t, const Slice&) override { + content_flags |= ContentFlags::HAS_SINGLE_DELETE; + return Status::OK(); + } + + Status MergeCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_MERGE; + return Status::OK(); + } + + Status MarkBeginPrepare() override { + content_flags |= ContentFlags::HAS_BEGIN_PREPARE; + return Status::OK(); + } + + Status MarkEndPrepare(const Slice&) override { 
+ content_flags |= ContentFlags::HAS_END_PREPARE; + return Status::OK(); + } + + Status MarkCommit(const Slice&) override { + content_flags |= ContentFlags::HAS_COMMIT; + return Status::OK(); + } + + Status MarkRollback(const Slice&) override { + content_flags |= ContentFlags::HAS_ROLLBACK; + return Status::OK(); + } +}; + +} // anon namespace + struct SavePoint { size_t size; // size of rep_ int count; // count of elements in rep_ - SavePoint(size_t s, int c) : size(s), count(c) {} + uint32_t content_flags; }; struct SavePoints { std::stack stack; }; -WriteBatch::WriteBatch(size_t reserved_bytes) : save_points_(nullptr) { - rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); - Clear(); +WriteBatch::WriteBatch(size_t reserved_bytes) + : save_points_(nullptr), content_flags_(0), rep_() { + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ? + reserved_bytes : WriteBatchInternal::kHeader); + rep_.resize(WriteBatchInternal::kHeader); } -WriteBatch::~WriteBatch() { - if (save_points_ != nullptr) { - delete save_points_; +WriteBatch::WriteBatch(const std::string& rep) + : save_points_(nullptr), + content_flags_(ContentFlags::DEFERRED), + rep_(rep) {} + +WriteBatch::WriteBatch(const WriteBatch& src) + : save_points_(src.save_points_), + content_flags_(src.content_flags_.load(std::memory_order_relaxed)), + rep_(src.rep_) {} + +WriteBatch::WriteBatch(WriteBatch&& src) + : save_points_(std::move(src.save_points_)), + content_flags_(src.content_flags_.load(std::memory_order_relaxed)), + rep_(std::move(src.rep_)) {} + +WriteBatch& WriteBatch::operator=(const WriteBatch& src) { + if (&src != this) { + this->~WriteBatch(); + new (this) WriteBatch(src); } + return *this; } +WriteBatch& WriteBatch::operator=(WriteBatch&& src) { + if (&src != this) { + this->~WriteBatch(); + new (this) WriteBatch(std::move(src)); + } + return *this; +} + +WriteBatch::~WriteBatch() { delete save_points_; } + WriteBatch::Handler::~Handler() { } void 
WriteBatch::Handler::LogData(const Slice& blob) { @@ -79,7 +176,9 @@ bool WriteBatch::Handler::Continue() { void WriteBatch::Clear() { rep_.clear(); - rep_.resize(kHeader); + rep_.resize(WriteBatchInternal::kHeader); + + content_flags_.store(0, std::memory_order_relaxed); if (save_points_ != nullptr) { while (!save_points_->stack.empty()) { @@ -92,9 +191,74 @@ int WriteBatch::Count() const { return WriteBatchInternal::Count(this); } +uint32_t WriteBatch::ComputeContentFlags() const { + auto rv = content_flags_.load(std::memory_order_relaxed); + if ((rv & ContentFlags::DEFERRED) != 0) { + BatchContentClassifier classifier; + Iterate(&classifier); + rv = classifier.content_flags; + + // this method is conceptually const, because it is performing a lazy + // computation that doesn't affect the abstract state of the batch. + // content_flags_ is marked mutable so that we can perform the + // following assignment + content_flags_.store(rv, std::memory_order_relaxed); + } + return rv; +} + +bool WriteBatch::HasPut() const { + return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; +} + +bool WriteBatch::HasDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0; +} + +bool WriteBatch::HasSingleDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0; +} + +bool WriteBatch::HasMerge() const { + return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0; +} + +bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) { + assert(input != nullptr && key != nullptr); + // Skip tag byte + input->remove_prefix(1); + + if (cf_record) { + // Skip column_family bytes + uint32_t cf; + if (!GetVarint32(input, &cf)) { + return false; + } + } + + // Extract key + return GetLengthPrefixedSlice(input, key); +} + +bool WriteBatch::HasBeginPrepare() const { + return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0; +} + +bool WriteBatch::HasEndPrepare() const { + return (ComputeContentFlags() & 
ContentFlags::HAS_END_PREPARE) != 0; +} + +bool WriteBatch::HasCommit() const { + return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0; +} + +bool WriteBatch::HasRollback() const { + return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0; +} + Status ReadRecordFromWriteBatch(Slice* input, char* tag, uint32_t* column_family, Slice* key, - Slice* value, Slice* blob) { + Slice* value, Slice* blob, Slice* xid) { assert(key != nullptr && value != nullptr); *tag = (*input)[0]; input->remove_prefix(1); @@ -140,6 +304,24 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, return Status::Corruption("bad WriteBatch Blob"); } break; + case kTypeNoop: + case kTypeBeginPrepareXID: + break; + case kTypeEndPrepareXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad EndPrepare XID"); + } + break; + case kTypeCommitXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Commit XID"); + } + break; + case kTypeRollbackXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Rollback XID"); + } + break; default: return Status::Corruption("unknown WriteBatch tag"); } @@ -148,12 +330,12 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, Status WriteBatch::Iterate(Handler* handler) const { Slice input(rep_); - if (input.size() < kHeader) { + if (input.size() < WriteBatchInternal::kHeader) { return Status::Corruption("malformed WriteBatch (too small)"); } - input.remove_prefix(kHeader); - Slice key, value, blob; + input.remove_prefix(WriteBatchInternal::kHeader); + Slice key, value, blob, xid; int found = 0; Status s; while (s.ok() && !input.empty() && handler->Continue()) { @@ -161,7 +343,7 @@ Status WriteBatch::Iterate(Handler* handler) const { uint32_t column_family = 0; // default s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, - &blob); + &blob, &xid); if (!s.ok()) { return s; } @@ -169,27 +351,57 @@ Status WriteBatch::Iterate(Handler* 
handler) const { switch (tag) { case kTypeColumnFamilyValue: case kTypeValue: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); s = handler->PutCF(column_family, key, value); found++; break; case kTypeColumnFamilyDeletion: case kTypeDeletion: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); s = handler->DeleteCF(column_family, key); found++; break; case kTypeColumnFamilySingleDeletion: case kTypeSingleDeletion: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); s = handler->SingleDeleteCF(column_family, key); found++; break; case kTypeColumnFamilyMerge: case kTypeMerge: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); s = handler->MergeCF(column_family, key, value); found++; break; case kTypeLogData: handler->LogData(blob); break; + case kTypeBeginPrepareXID: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + handler->MarkBeginPrepare(); + break; + case kTypeEndPrepareXID: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); + handler->MarkEndPrepare(xid); + break; + case kTypeCommitXID: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + handler->MarkCommit(xid); + break; + case kTypeRollbackXID: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); + handler->MarkRollback(xid); + break; + case kTypeNoop: + break; default: return Status::Corruption("unknown WriteBatch tag"); } @@ -220,7 +432,9 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { EncodeFixed64(&b->rep_[0], seq); } -size_t 
WriteBatchInternal::GetFirstOffset(WriteBatch* b) { return kHeader; } +size_t WriteBatchInternal::GetFirstOffset(WriteBatch* b) { + return WriteBatchInternal::kHeader; +} void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, const Slice& key, const Slice& value) { @@ -233,6 +447,9 @@ void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, } PutLengthPrefixedSlice(&b->rep_, key); PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); } void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, @@ -251,6 +468,9 @@ void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, } PutLengthPrefixedSliceParts(&b->rep_, key); PutLengthPrefixedSliceParts(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); } void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, @@ -258,6 +478,47 @@ void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); } +void WriteBatchInternal::InsertNoop(WriteBatch* b) { + b->rep_.push_back(static_cast(kTypeNoop)); +} + +void WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) { + // a manually constructed batch can only contain one prepare section + assert(b->rep_[12] == static_cast(kTypeNoop)); + + // all savepoints up to this point are cleared + if (b->save_points_ != nullptr) { + while (!b->save_points_->stack.empty()) { + b->save_points_->stack.pop(); + } + } + + // rewrite noop as begin marker + b->rep_[12] = static_cast(kTypeBeginPrepareXID); + b->rep_.push_back(static_cast(kTypeEndPrepareXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + 
ContentFlags::HAS_END_PREPARE | + ContentFlags::HAS_BEGIN_PREPARE, + std::memory_order_relaxed); +} + +void WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast(kTypeCommitXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); +} + +void WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast(kTypeRollbackXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_ROLLBACK, + std::memory_order_relaxed); +} + void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, const Slice& key) { WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); @@ -268,6 +529,9 @@ void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, PutVarint32(&b->rep_, column_family_id); } PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); } void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { @@ -284,6 +548,9 @@ void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, PutVarint32(&b->rep_, column_family_id); } PutLengthPrefixedSliceParts(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); } void WriteBatch::Delete(ColumnFamilyHandle* column_family, @@ -301,6 +568,9 @@ void WriteBatchInternal::SingleDelete(WriteBatch* b, uint32_t column_family_id, PutVarint32(&b->rep_, column_family_id); } PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_SINGLE_DELETE, + std::memory_order_relaxed); } void 
WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, @@ -318,6 +588,9 @@ void WriteBatchInternal::SingleDelete(WriteBatch* b, uint32_t column_family_id, PutVarint32(&b->rep_, column_family_id); } PutLengthPrefixedSliceParts(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_SINGLE_DELETE, + std::memory_order_relaxed); } void WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, @@ -336,6 +609,9 @@ void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, } PutLengthPrefixedSlice(&b->rep_, key); PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_MERGE, + std::memory_order_relaxed); } void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, @@ -355,6 +631,9 @@ void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, } PutLengthPrefixedSliceParts(&b->rep_, key); PutLengthPrefixedSliceParts(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_MERGE, + std::memory_order_relaxed); } void WriteBatch::Merge(ColumnFamilyHandle* column_family, @@ -374,7 +653,8 @@ void WriteBatch::SetSavePoint() { save_points_ = new SavePoints(); } // Record length and count of current batch of writes. 
- save_points_->stack.push(SavePoint(GetDataSize(), Count())); + save_points_->stack.push(SavePoint{ + GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)}); } Status WriteBatch::RollbackToSavePoint() { @@ -387,6 +667,7 @@ Status WriteBatch::RollbackToSavePoint() { save_points_->stack.pop(); assert(savepoint.size <= rep_.size()); + assert(savepoint.count <= Count()); if (savepoint.size == rep_.size()) { // No changes to rollback @@ -396,41 +677,64 @@ Status WriteBatch::RollbackToSavePoint() { } else { rep_.resize(savepoint.size); WriteBatchInternal::SetCount(this, savepoint.count); + content_flags_.store(savepoint.content_flags, std::memory_order_relaxed); } return Status::OK(); } -namespace { -// This class can *only* be used from a single-threaded write thread, because it -// calls ColumnFamilyMemTablesImpl::Seek() class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; - ColumnFamilyMemTables* cf_mems_; - bool ignore_missing_column_families_; - uint64_t log_number_; + ColumnFamilyMemTables* const cf_mems_; + FlushScheduler* const flush_scheduler_; + const bool ignore_missing_column_families_; + const uint64_t recovering_log_number_; + // log number that all Memtables inserted into should reference + uint64_t log_number_ref_; DBImpl* db_; - const bool dont_filter_deletes_; - + const bool concurrent_memtable_writes_; + bool* has_valid_writes_; + typedef std::map MemPostInfoMap; + MemPostInfoMap mem_post_info_map_; + // current recovered transaction we are rebuilding (recovery) + WriteBatch* rebuilding_trx_; + + // cf_mems should not be shared with concurrent inserters MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, - bool ignore_missing_column_families, uint64_t log_number, - DB* db, const bool dont_filter_deletes) + FlushScheduler* flush_scheduler, + bool ignore_missing_column_families, + uint64_t recovering_log_number, DB* db, + bool concurrent_memtable_writes, + bool* has_valid_writes 
= nullptr) : sequence_(sequence), cf_mems_(cf_mems), + flush_scheduler_(flush_scheduler), ignore_missing_column_families_(ignore_missing_column_families), - log_number_(log_number), + recovering_log_number_(recovering_log_number), + log_number_ref_(0), db_(reinterpret_cast(db)), - dont_filter_deletes_(dont_filter_deletes) { - assert(cf_mems); - if (!dont_filter_deletes_) { - assert(db_); + concurrent_memtable_writes_(concurrent_memtable_writes), + has_valid_writes_(has_valid_writes), + rebuilding_trx_(nullptr) { + assert(cf_mems_); + } + + void set_log_number_ref(uint64_t log) { log_number_ref_ = log; } + + SequenceNumber get_final_sequence() { return sequence_; } + + void PostProcess() { + for (auto& pair : mem_post_info_map_) { + pair.first->BatchPostProcess(pair.second); } } bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { - // We are only allowed to call this from a single-threaded write thread - // (or while holding DB mutex) + // If we are in a concurrent mode, it is the caller's responsibility + // to clone the original ColumnFamilyMemTables so that each thread + // has its own instance. Otherwise, it must be guaranteed that there + // is no concurrent access bool found = cf_mems_->Seek(column_family_id); if (!found) { if (ignore_missing_column_families_) { @@ -441,33 +745,55 @@ class MemTableInserter : public WriteBatch::Handler { } return false; } - if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) { - // This is true only in recovery environment (log_number_ is always 0 in + if (recovering_log_number_ != 0 && + recovering_log_number_ < cf_mems_->GetLogNumber()) { + // This is true only in recovery environment (recovering_log_number_ is + // always 0 in // non-recovery, regular write code-path) - // * If log_number_ < cf_mems_->GetLogNumber(), this means that column + // * If recovering_log_number_ < cf_mems_->GetLogNumber(), this means that + // column // family already contains updates from this log. 
We can't apply updates // twice because of update-in-place or merge workloads -- ignore the // update *s = Status::OK(); return false; } + + if (has_valid_writes_ != nullptr) { + *has_valid_writes_ = true; + } + + if (log_number_ref_ > 0) { + cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_); + } + return true; } + virtual Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { + if (rebuilding_trx_ != nullptr) { + WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + return Status::OK(); + } + Status seek_status; if (!SeekToColumnFamily(column_family_id, &seek_status)) { ++sequence_; return seek_status; } + MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetMemTableOptions(); if (!moptions->inplace_update_support) { - mem->Add(sequence_, kTypeValue, key, value); + mem->Add(sequence_, kTypeValue, key, value, concurrent_memtable_writes_, + get_post_process_info(mem)); } else if (moptions->inplace_callback == nullptr) { + assert(!concurrent_memtable_writes_); mem->Update(sequence_, key, value); RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED); } else { + assert(!concurrent_memtable_writes_); if (mem->UpdateCallback(sequence_, key, value)) { } else { // key not found in memtable. Do sst get, update, add @@ -505,77 +831,66 @@ class MemTableInserter : public WriteBatch::Handler { // sequence number. Even if the update eventually fails and does not result // in memtable add/update. 
sequence_++; - cf_mems_->CheckMemtableFull(); + CheckMemtableFull(); + return Status::OK(); + } + + Status DeleteImpl(uint32_t column_family_id, const Slice& key, + ValueType delete_type) { + MemTable* mem = cf_mems_->GetMemTable(); + mem->Add(sequence_, delete_type, key, Slice(), concurrent_memtable_writes_, + get_post_process_info(mem)); + sequence_++; + CheckMemtableFull(); return Status::OK(); } virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + if (rebuilding_trx_ != nullptr) { + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + return Status::OK(); + } + Status seek_status; if (!SeekToColumnFamily(column_family_id, &seek_status)) { ++sequence_; return seek_status; } - MemTable* mem = cf_mems_->GetMemTable(); - auto* moptions = mem->GetMemTableOptions(); - if (!dont_filter_deletes_ && moptions->filter_deletes) { - SnapshotImpl read_from_snapshot; - read_from_snapshot.number_ = sequence_; - ReadOptions ropts; - ropts.snapshot = &read_from_snapshot; - std::string value; - auto cf_handle = cf_mems_->GetColumnFamilyHandle(); - if (cf_handle == nullptr) { - cf_handle = db_->DefaultColumnFamily(); - } - if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { - RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES); - return Status::OK(); - } - } - mem->Add(sequence_, kTypeDeletion, key, Slice()); - sequence_++; - cf_mems_->CheckMemtableFull(); - return Status::OK(); + + return DeleteImpl(column_family_id, key, kTypeDeletion); } virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + if (rebuilding_trx_ != nullptr) { + WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + return Status::OK(); + } + Status seek_status; if (!SeekToColumnFamily(column_family_id, &seek_status)) { ++sequence_; return seek_status; } - MemTable* mem = cf_mems_->GetMemTable(); - auto* moptions = mem->GetMemTableOptions(); - if (!dont_filter_deletes_ && moptions->filter_deletes) { - 
SnapshotImpl read_from_snapshot; - read_from_snapshot.number_ = sequence_; - ReadOptions ropts; - ropts.snapshot = &read_from_snapshot; - std::string value; - auto cf_handle = cf_mems_->GetColumnFamilyHandle(); - if (cf_handle == nullptr) { - cf_handle = db_->DefaultColumnFamily(); - } - if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { - RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES); - return Status::OK(); - } - } - mem->Add(sequence_, kTypeSingleDeletion, key, Slice()); - sequence_++; - cf_mems_->CheckMemtableFull(); - return Status::OK(); + + return DeleteImpl(column_family_id, key, kTypeSingleDeletion); } virtual Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { + assert(!concurrent_memtable_writes_); + if (rebuilding_trx_ != nullptr) { + WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); + return Status::OK(); + } + Status seek_status; if (!SeekToColumnFamily(column_family_id, &seek_status)) { ++sequence_; return seek_status; } + MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetMemTableOptions(); bool perform_merge = false; @@ -614,23 +929,14 @@ class MemTableInserter : public WriteBatch::Handler { auto merge_operator = moptions->merge_operator; assert(merge_operator); - std::deque operands; - operands.push_front(value.ToString()); std::string new_value; - bool merge_success = false; - { - StopWatchNano timer(Env::Default(), moptions->statistics != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - merge_success = merge_operator->FullMerge( - key, &get_value_slice, operands, &new_value, moptions->info_log); - RecordTick(moptions->statistics, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanos()); - } - if (!merge_success) { - // Failed to merge! 
- RecordTick(moptions->statistics, NUMBER_MERGE_FAILURES); + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, &get_value_slice, {value}, &new_value, + moptions->info_log, moptions->statistics, Env::Default()); + if (!merge_status.ok()) { + // Failed to merge! // Store the delta in memtable perform_merge = false; } else { @@ -645,37 +951,223 @@ class MemTableInserter : public WriteBatch::Handler { } sequence_++; - cf_mems_->CheckMemtableFull(); + CheckMemtableFull(); + return Status::OK(); + } + + void CheckMemtableFull() { + if (flush_scheduler_ != nullptr) { + auto* cfd = cf_mems_->current(); + assert(cfd != nullptr); + if (cfd->mem()->ShouldScheduleFlush() && + cfd->mem()->MarkFlushScheduled()) { + // MarkFlushScheduled only returns true if we are the one that + // should take action, so no need to dedup further + flush_scheduler_->ScheduleFlush(cfd); + } + } + } + + Status MarkBeginPrepare() override { + assert(rebuilding_trx_ == nullptr); + assert(db_); + + if (recovering_log_number_ != 0) { + // during recovery we rebuild a hollow transaction + // from all encountered prepare sections of the wal + if (db_->allow_2pc() == false) { + return Status::NotSupported( + "WAL contains prepared transactions. Open with " + "TransactionDB::Open()."); + } + + // we are now iterating through a prepared section + rebuilding_trx_ = new WriteBatch(); + if (has_valid_writes_ != nullptr) { + *has_valid_writes_ = true; + } + } else { + // in non-recovery we ignore prepare markers + // and insert the values directly. making sure we have a + // log for each insertion to reference. 
+ assert(log_number_ref_ > 0); + } + + return Status::OK(); + } + + Status MarkEndPrepare(const Slice& name) override { + assert(db_); + assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0)); + + if (recovering_log_number_ != 0) { + assert(db_->allow_2pc()); + db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(), + rebuilding_trx_); + rebuilding_trx_ = nullptr; + } else { + assert(rebuilding_trx_ == nullptr); + assert(log_number_ref_ > 0); + } + + return Status::OK(); + } + + Status MarkCommit(const Slice& name) override { + assert(db_); + + Status s; + + if (recovering_log_number_ != 0) { + // in recovery when we encounter a commit marker + // we lookup this transaction in our set of rebuilt transactions + // and commit. + auto trx = db_->GetRecoveredTransaction(name.ToString()); + + // the log containing the prepared section may have + // been released in the last incarnation because the + // data was flushed to L0 + if (trx != nullptr) { + // at this point individual CF lognumbers will prevent + // duplicate re-insertion of values.
+ assert(log_number_ref_ == 0); + // all inserts must reference this trx log number + log_number_ref_ = trx->log_number_; + s = trx->batch_->Iterate(this); + log_number_ref_ = 0; + + if (s.ok()) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + if (has_valid_writes_ != nullptr) { + *has_valid_writes_ = true; + } + } + } else { + // in non recovery we simply ignore this tag + } + + return s; + } + + Status MarkRollback(const Slice& name) override { + assert(db_); + + if (recovering_log_number_ != 0) { + auto trx = db_->GetRecoveredTransaction(name.ToString()); + + // the log containing the transactions prep section + // may have been released in the previous incarnation + // because we knew it had been rolled back + if (trx != nullptr) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + } else { + // in non recovery we simply ignore this tag + } + + return Status::OK(); + } + + private: + MemTablePostProcessInfo* get_post_process_info(MemTable* mem) { + if (!concurrent_memtable_writes_) { + // No need to batch counters locally if we don't use concurrent mode.
+ return nullptr; + } + return &mem_post_info_map_[mem]; + } }; -} // namespace // This function can only be called in these conditions: // 1) During Recovery() -// 2) during Write(), in a single-threaded write thread -// The reason is that it calles ColumnFamilyMemTablesImpl::Seek(), which needs -// to be called from a single-threaded write thread (or while holding DB mutex) -Status WriteBatchInternal::InsertInto(const WriteBatch* b, +// 2) During Write(), in a single-threaded write thread +// 3) During Write(), in a concurrent context where memtables has been cloned +// The reason is that it calls memtables->Seek(), which has a stateful cache +Status WriteBatchInternal::InsertInto( + const autovector& writers, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes) { + MemTableInserter inserter(sequence, memtables, flush_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes); + for (size_t i = 0; i < writers.size(); i++) { + auto w = writers[i]; + if (!w->ShouldWriteToMemtable()) { + continue; + } + inserter.set_log_number_ref(w->log_ref); + w->status = w->batch->Iterate(&inserter); + if (!w->status.ok()) { + return w->status; + } + } + return Status::OK(); +} + +Status WriteBatchInternal::InsertInto(WriteThread::Writer* writer, ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, bool ignore_missing_column_families, uint64_t log_number, DB* db, - const bool dont_filter_deletes) { - MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, + bool concurrent_memtable_writes) { + MemTableInserter inserter(WriteBatchInternal::Sequence(writer->batch), + memtables, flush_scheduler, ignore_missing_column_families, log_number, db, - dont_filter_deletes); - return b->Iterate(&inserter); + concurrent_memtable_writes); + assert(writer->ShouldWriteToMemtable()); + 
inserter.set_log_number_ref(writer->log_ref); + Status s = writer->batch->Iterate(&inserter); + if (concurrent_memtable_writes) { + inserter.PostProcess(); + } + return s; +} + +Status WriteBatchInternal::InsertInto( + const WriteBatch* batch, ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, bool ignore_missing_column_families, + uint64_t log_number, DB* db, bool concurrent_memtable_writes, + SequenceNumber* last_seq_used, bool* has_valid_writes) { + MemTableInserter inserter(WriteBatchInternal::Sequence(batch), memtables, + flush_scheduler, ignore_missing_column_families, + log_number, db, concurrent_memtable_writes, + has_valid_writes); + Status s = batch->Iterate(&inserter); + if (last_seq_used != nullptr) { + *last_seq_used = inserter.get_final_sequence(); + } + if (concurrent_memtable_writes) { + inserter.PostProcess(); + } + return s; } void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { - assert(contents.size() >= kHeader); + assert(contents.size() >= WriteBatchInternal::kHeader); b->rep_.assign(contents.data(), contents.size()); + b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed); } void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { SetCount(dst, Count(dst) + Count(src)); - assert(src->rep_.size() >= kHeader); - dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); + assert(src->rep_.size() >= WriteBatchInternal::kHeader); + dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, + src->rep_.size() - WriteBatchInternal::kHeader); + dst->content_flags_.store( + dst->content_flags_.load(std::memory_order_relaxed) | + src->content_flags_.load(std::memory_order_relaxed), + std::memory_order_relaxed); +} + +size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize, + size_t rightByteSize) { + if (leftByteSize == 0 || rightByteSize == 0) { + return leftByteSize + rightByteSize; + } else { + return leftByteSize + rightByteSize - 
WriteBatchInternal::kHeader; + } } } // namespace rocksdb diff --git a/external/rocksdb/db/write_batch_base.cc b/external/rocksdb/db/write_batch_base.cc index 9f7f00d2cf..3936fbd922 100644 --- a/external/rocksdb/db/write_batch_base.cc +++ b/external/rocksdb/db/write_batch_base.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/db/write_batch_internal.h b/external/rocksdb/db/write_batch_internal.h index 04db461a06..cc036cbf6b 100644 --- a/external/rocksdb/db/write_batch_internal.h +++ b/external/rocksdb/db/write_batch_internal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,14 +8,19 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once +#include +#include "db/write_thread.h" #include "rocksdb/types.h" #include "rocksdb/write_batch.h" #include "rocksdb/db.h" #include "rocksdb/options.h" +#include "util/autovector.h" namespace rocksdb { class MemTable; +class FlushScheduler; +class ColumnFamilyData; class ColumnFamilyMemTables { public: @@ -27,7 +32,7 @@ class ColumnFamilyMemTables { virtual uint64_t GetLogNumber() const = 0; virtual MemTable* GetMemTable() const = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; - virtual void CheckMemtableFull() = 0; + virtual ColumnFamilyData* current() { return nullptr; } }; class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { @@ -49,8 +54,6 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } - void CheckMemtableFull() override {} - private: bool ok_; MemTable* mem_; @@ -60,6 +63,10 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { public: + + // WriteBatch header has an 8-byte sequence number followed by a 4-byte count. + static const size_t kHeader = 12; + // WriteBatch methods with column_family_id instead of ColumnFamilyHandle* static void Put(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const Slice& value); @@ -85,6 +92,14 @@ class WriteBatchInternal { static void Merge(WriteBatch* batch, uint32_t column_family_id, const SliceParts& key, const SliceParts& value); + static void MarkEndPrepare(WriteBatch* batch, const Slice& xid); + + static void MarkRollback(WriteBatch* batch, const Slice& xid); + + static void MarkCommit(WriteBatch* batch, const Slice& xid); + + static void InsertNoop(WriteBatch* batch); + // Return the number of entries in the batch. 
static int Count(const WriteBatch* batch); @@ -112,24 +127,54 @@ class WriteBatchInternal { static void SetContents(WriteBatch* batch, const Slice& contents); - // Inserts batch entries into memtable - // If dont_filter_deletes is false AND options.filter_deletes is true, - // then --> Drops deletes in batch if db->KeyMayExist returns false - // If ignore_missing_column_families == true. WriteBatch referencing - // non-existing column family should be ignored. - // However, if ignore_missing_column_families == false, any WriteBatch - // referencing non-existing column family will return a InvalidArgument() - // failure. + // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive. + // + // If ignore_missing_column_families == true. WriteBatch + // referencing non-existing column family will be ignored. + // If ignore_missing_column_families == false, processing of the + // batches will be stopped if a reference is found to a non-existing + // column family and InvalidArgument() will be returned. The writes + // in batches may be only partially applied at that point. // // If log_number is non-zero, the memtable will be updated only if - // memtables->GetLogNumber() >= log_number + // memtables->GetLogNumber() >= log_number. + // + // If flush_scheduler is non-null, it will be invoked if the memtable + // should be flushed. + // + // Under concurrent use, the caller is responsible for making sure that + // the memtables object itself is thread-local. 
+ static Status InsertInto(const autovector& batches, + SequenceNumber sequence, + ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, + bool ignore_missing_column_families = false, + uint64_t log_number = 0, DB* db = nullptr, + bool concurrent_memtable_writes = false); + + // Convenience form of InsertInto when you have only one batch + // last_seq_used returns the last sequence number used in a MemTable insert static Status InsertInto(const WriteBatch* batch, ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, - const bool dont_filter_deletes = true); + bool concurrent_memtable_writes = false, + SequenceNumber* last_seq_used = nullptr, + bool* has_valid_writes = nullptr); + + static Status InsertInto(WriteThread::Writer* writer, + ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, + bool ignore_missing_column_families = false, + uint64_t log_number = 0, DB* db = nullptr, + bool concurrent_memtable_writes = false); static void Append(WriteBatch* dst, const WriteBatch* src); + + // Returns the byte size of appending a WriteBatch with ByteSize + // leftByteSize and a WriteBatch with ByteSize rightByteSize + static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); }; } // namespace rocksdb diff --git a/external/rocksdb/db/write_batch_test.cc b/external/rocksdb/db/write_batch_test.cc index d8c6f8cb06..cc351a5a6b 100644 --- a/external/rocksdb/db/write_batch_test.cc +++ b/external/rocksdb/db/write_batch_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory.
@@ -10,17 +10,17 @@ #include "rocksdb/db.h" #include -#include "db/memtable.h" #include "db/column_family.h" +#include "db/memtable.h" #include "db/write_batch_internal.h" -#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/scoped_arena_iterator.h" #include "util/logging.h" #include "util/string_util.h" #include "util/testharness.h" -#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -30,15 +30,19 @@ static std::string PrintContents(WriteBatch* b) { Options options; options.memtable_factory = factory; ImmutableCFOptions ioptions(options); - WriteBuffer wb(options.db_write_buffer_size); + WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb, kMaxSequenceNumber); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem); - Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); + Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr); int count = 0; + int put_count = 0; + int delete_count = 0; + int single_delete_count = 0; + int merge_count = 0; Arena arena; ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -53,18 +57,21 @@ static std::string PrintContents(WriteBatch* b) { state.append(iter->value().ToString()); state.append(")"); count++; + put_count++; break; case kTypeDeletion: state.append("Delete("); state.append(ikey.user_key.ToString()); state.append(")"); count++; + delete_count++; break; case kTypeSingleDeletion: state.append("SingleDelete("); state.append(ikey.user_key.ToString()); state.append(")"); count++; + single_delete_count++; break; case kTypeMerge: state.append("Merge("); @@ -73,6 +80,7 @@ static std::string PrintContents(WriteBatch* b) { state.append(iter->value().ToString()); 
state.append(")"); count++; + merge_count++; break; default: assert(false); @@ -81,6 +89,10 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } + EXPECT_EQ(b->HasPut(), put_count > 0); + EXPECT_EQ(b->HasDelete(), delete_count > 0); + EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); + EXPECT_EQ(b->HasMerge(), merge_count > 0); if (!s.ok()) { state.append(s.ToString()); } else if (count != WriteBatchInternal::Count(b)) { @@ -219,6 +231,22 @@ namespace { virtual void LogData(const Slice& blob) override { seen += "LogData(" + blob.ToString() + ")"; } + virtual Status MarkBeginPrepare() override { + seen += "MarkBeginPrepare()"; + return Status::OK(); + } + virtual Status MarkEndPrepare(const Slice& xid) override { + seen += "MarkEndPrepare(" + xid.ToString() + ")"; + return Status::OK(); + } + virtual Status MarkCommit(const Slice& xid) override { + seen += "MarkCommit(" + xid.ToString() + ")"; + return Status::OK(); + } + virtual Status MarkRollback(const Slice& xid) override { + seen += "MarkRollback(" + xid.ToString() + ")"; + return Status::OK(); + } }; } @@ -296,6 +324,145 @@ TEST_F(WriteBatchTest, Blob) { handler.seen); } +TEST_F(WriteBatchTest, PrepareCommit) { + WriteBatch batch; + WriteBatchInternal::InsertNoop(&batch); + batch.Put(Slice("k1"), Slice("v1")); + batch.Put(Slice("k2"), Slice("v2")); + batch.SetSavePoint(); + WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")); + Status s = batch.RollbackToSavePoint(); + ASSERT_EQ(s, Status::NotFound()); + WriteBatchInternal::MarkCommit(&batch, Slice("xid1")); + WriteBatchInternal::MarkRollback(&batch, Slice("xid1")); + ASSERT_EQ(2, batch.Count()); + + TestHandler handler; + batch.Iterate(&handler); + ASSERT_EQ( + "MarkBeginPrepare()" + "Put(k1, v1)" + "Put(k2, v2)" + "MarkEndPrepare(xid1)" + "MarkCommit(xid1)" + "MarkRollback(xid1)", + handler.seen); +} + +// It requires more than 30GB of memory to run the test. 
With single memory +// allocation of more than 30GB. +// Not all platform can run it. Also it runs a long time. So disable it. +TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { + // Insert key and value of 3GB and push total batch size to 12GB. + static const size_t kKeyValueSize = 4u; + static const uint32_t kNumUpdates = 3 << 30; + std::string raw(kKeyValueSize, 'A'); + WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u); + char c = 'A'; + for (uint32_t i = 0; i < kNumUpdates; i++) { + if (c > 'Z') { + c = 'A'; + } + raw[0] = c; + raw[raw.length() - 1] = c; + c++; + batch.Put(raw, raw); + } + + ASSERT_EQ(kNumUpdates, batch.Count()); + + struct NoopHandler : public WriteBatch::Handler { + uint32_t num_seen = 0; + char expected_char = 'A'; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_EQ(kKeyValueSize, key.size()); + EXPECT_EQ(kKeyValueSize, value.size()); + EXPECT_EQ(expected_char, key[0]); + EXPECT_EQ(expected_char, value[0]); + EXPECT_EQ(expected_char, key[kKeyValueSize - 1]); + EXPECT_EQ(expected_char, value[kKeyValueSize - 1]); + expected_char++; + if (expected_char > 'Z') { + expected_char = 'A'; + } + ++num_seen; + return Status::OK(); + } + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual void LogData(const Slice& blob) override { EXPECT_TRUE(false); } + virtual bool Continue() override { return num_seen < kNumUpdates; } + } handler; + + batch.Iterate(&handler); + ASSERT_EQ(kNumUpdates, handler.num_seen); +} + +// The test requires more than 18GB memory to run it, with single memory +// allocation of more 
than 12GB. Not all the platform can run it. So disable it. +TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { + // Insert key and value of 3GB and push total batch size to 12GB. + static const size_t kKeyValueSize = 3221225472u; + std::string raw(kKeyValueSize, 'A'); + WriteBatch batch(12884901888u + 1024u); + for (char i = 0; i < 2; i++) { + raw[0] = 'A' + i; + raw[raw.length() - 1] = 'A' - i; + batch.Put(raw, raw); + } + + ASSERT_EQ(2, batch.Count()); + + struct NoopHandler : public WriteBatch::Handler { + int num_seen = 0; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_EQ(kKeyValueSize, key.size()); + EXPECT_EQ(kKeyValueSize, value.size()); + EXPECT_EQ('A' + num_seen, key[0]); + EXPECT_EQ('A' + num_seen, value[0]); + EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]); + EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]); + ++num_seen; + return Status::OK(); + } + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual void LogData(const Slice& blob) override { EXPECT_TRUE(false); } + virtual bool Continue() override { return num_seen < 2; } + } handler; + + batch.Iterate(&handler); + ASSERT_EQ(2, handler.num_seen); +} + TEST_F(WriteBatchTest, Continue) { WriteBatch batch; diff --git a/external/rocksdb/db/write_callback.h b/external/rocksdb/db/write_callback.h index 7dcca96fe5..93c80d6510 100644 --- a/external/rocksdb/db/write_callback.h +++ b/external/rocksdb/db/write_callback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,6 +19,9 @@ class WriteCallback { // this function returns a non-OK status, the write will be aborted and this // status will be returned to the caller of DB::Write(). virtual Status Callback(DB* db) = 0; + + // return true if writes with this callback can be batched with other writes + virtual bool AllowWriteBatching() = 0; }; } // namespace rocksdb diff --git a/external/rocksdb/db/write_callback_test.cc b/external/rocksdb/db/write_callback_test.cc index 47b7cf72a3..33aaab7f44 100644 --- a/external/rocksdb/db/write_callback_test.cc +++ b/external/rocksdb/db/write_callback_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -6,12 +6,15 @@ #ifndef ROCKSDB_LITE #include +#include +#include #include "db/db_impl.h" #include "db/write_callback.h" #include "rocksdb/db.h" #include "rocksdb/write_batch.h" #include "util/logging.h" +#include "util/sync_point.h" #include "util/testharness.h" using std::string; @@ -42,6 +45,8 @@ class WriteCallbackTestWriteCallback1 : public WriteCallback { return Status::OK(); } + + bool AllowWriteBatching() override { return true; } }; class WriteCallbackTestWriteCallback2 : public WriteCallback { @@ -49,8 +54,227 @@ class WriteCallbackTestWriteCallback2 : public WriteCallback { Status Callback(DB *db) override { return Status::Busy(); } + bool AllowWriteBatching() override { return true; } }; +class MockWriteCallback : public WriteCallback { + public: + bool should_fail_ = false; + bool was_called_ = false; + bool allow_batching_ = false; + + Status Callback(DB* db) override { + was_called_ = true; + if (should_fail_) { + return Status::Busy(); + } else { + return Status::OK(); + } + } + + bool AllowWriteBatching() override { return allow_batching_; } +}; + +TEST_F(WriteCallbackTest, WriteWithCallbackTest) { + struct WriteOP { + WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; } + + void Put(const string& key, const string& val) { + kvs_.push_back(std::make_pair(key, val)); + write_batch_.Put(key, val); + } + + void Clear() { + kvs_.clear(); + write_batch_.Clear(); + callback_.was_called_ = false; + } + + MockWriteCallback callback_; + WriteBatch write_batch_; + std::vector> kvs_; + }; + + std::vector> write_scenarios = { + {true}, + {false}, + {false, false}, + {true, true}, + {true, false}, + {false, true}, + {false, false, false}, + {true, true, true}, + {false, true, false}, + {true, false, true}, + {true, false, false, false, false}, + {false, false, false, false, true}, + {false, false, true, false, true}, + }; + + for (auto& allow_parallel : {true, false}) { + for (auto& allow_batching : {true, false}) { + for (auto& 
enable_WAL : {true, false}) { + for (auto& write_group : write_scenarios) { + Options options; + options.create_if_missing = true; + options.allow_concurrent_memtable_write = allow_parallel; + + ReadOptions read_options; + DB* db; + DBImpl* db_impl; + + DestroyDB(dbname, options); + ASSERT_OK(DB::Open(options, dbname, &db)); + + db_impl = dynamic_cast(db); + ASSERT_TRUE(db_impl); + + std::atomic threads_waiting(0); + std::atomic seq(db_impl->GetLatestSequenceNumber()); + ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + uint64_t cur_threads_waiting = 0; + bool is_leader = false; + bool is_last = false; + + // who am i + do { + cur_threads_waiting = threads_waiting.load(); + is_leader = (cur_threads_waiting == 0); + is_last = (cur_threads_waiting == write_group.size() - 1); + } while (!threads_waiting.compare_exchange_strong( + cur_threads_waiting, cur_threads_waiting + 1)); + + // check my state + auto* writer = reinterpret_cast(arg); + + if (is_leader) { + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_GROUP_LEADER); + } else { + ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT); + } + + // (meta test) the first WriteOP should indeed be the first + // and the last should be the last (all others can be out of + // order) + if (is_leader) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.front().callback_.should_fail_); + } else if (is_last) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.back().callback_.should_fail_); + } + + // wait for friends + while (threads_waiting.load() < write_group.size()) { + } + }); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) { + // check my state + auto* writer = reinterpret_cast(arg); + + if (!allow_batching) { + // no batching so everyone should be a leader + ASSERT_TRUE(writer->state == 
+ WriteThread::State::STATE_GROUP_LEADER); + } else if (!allow_parallel) { + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_COMPLETED); + } + }); + + std::atomic thread_num(0); + std::atomic dummy_key(0); + std::function write_with_callback_func = [&]() { + uint32_t i = thread_num.fetch_add(1); + Random rnd(i); + + // leaders gotta lead + while (i > 0 && threads_waiting.load() < 1) { + } + + // loser has to lose + while (i == write_group.size() - 1 && + threads_waiting.load() < write_group.size() - 1) { + } + + auto& write_op = write_group.at(i); + write_op.Clear(); + write_op.callback_.allow_batching_ = allow_batching; + + // insert some keys + for (uint32_t j = 0; j < rnd.Next() % 50; j++) { + // grab unique key + char my_key = 0; + do { + my_key = dummy_key.load(); + } while (!dummy_key.compare_exchange_strong(my_key, my_key + 1)); + + string skey(5, my_key); + string sval(10, my_key); + write_op.Put(skey, sval); + + if (!write_op.callback_.should_fail_) { + seq.fetch_add(1); + } + } + + WriteOptions woptions; + woptions.disableWAL = !enable_WAL; + woptions.sync = enable_WAL; + Status s = db_impl->WriteWithCallback( + woptions, &write_op.write_batch_, &write_op.callback_); + + if (write_op.callback_.should_fail_) { + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + }; + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // do all the writes + std::vector threads; + for (uint32_t i = 0; i < write_group.size(); i++) { + threads.emplace_back(write_with_callback_func); + } + for (auto& t : threads) { + t.join(); + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + // check for keys + string value; + for (auto& w : write_group) { + ASSERT_TRUE(w.callback_.was_called_); + for (auto& kvp : w.kvs_) { + if (w.callback_.should_fail_) { + ASSERT_TRUE( + db->Get(read_options, kvp.first, &value).IsNotFound()); + } else { + ASSERT_OK(db->Get(read_options, kvp.first, &value)); + ASSERT_EQ(value, kvp.second); + } + } + } + + 
ASSERT_EQ(seq.load(), db_impl->GetLatestSequenceNumber()); + + delete db; + DestroyDB(dbname, options); + } + } + } + } +} + TEST_F(WriteCallbackTest, WriteCallBackTest) { Options options; WriteOptions write_options; @@ -59,6 +283,8 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { DB* db; DBImpl* db_impl; + DestroyDB(dbname, options); + options.create_if_missing = true; Status s = DB::Open(options, dbname, &db); ASSERT_OK(s); diff --git a/external/rocksdb/db/write_controller.cc b/external/rocksdb/db/write_controller.cc index c26f6fbc45..d46d8d3ddc 100644 --- a/external/rocksdb/db/write_controller.cc +++ b/external/rocksdb/db/write_controller.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,16 +16,25 @@ std::unique_ptr WriteController::GetStopToken() { return std::unique_ptr(new StopWriteToken(this)); } -std::unique_ptr WriteController::GetDelayToken() { - if (total_delayed_++ == 0) { - last_refill_time_ = 0; - bytes_left_ = 0; - } +std::unique_ptr WriteController::GetDelayToken( + uint64_t write_rate) { + total_delayed_++; + // Reset counters. + last_refill_time_ = 0; + bytes_left_ = 0; + set_delayed_write_rate(write_rate); return std::unique_ptr(new DelayWriteToken(this)); } +std::unique_ptr +WriteController::GetCompactionPressureToken() { + ++total_compaction_pressure_; + return std::unique_ptr( + new CompactionPressureToken(this)); +} + bool WriteController::IsStopped() const { return total_stopped_ > 0; } -// Tihs is inside DB mutex, so we can't sleep and need to minimize +// This is inside DB mutex, so we can't sleep and need to minimize // frequency to get time. 
// If it turns out to be a performance issue, we can redesign the thread // synchronization model here. @@ -104,4 +113,9 @@ DelayWriteToken::~DelayWriteToken() { assert(controller_->total_delayed_ >= 0); } +CompactionPressureToken::~CompactionPressureToken() { + controller_->total_compaction_pressure_--; + assert(controller_->total_compaction_pressure_ >= 0); +} + } // namespace rocksdb diff --git a/external/rocksdb/db/write_controller.h b/external/rocksdb/db/write_controller.h index 50e5a99beb..0520471774 100644 --- a/external/rocksdb/db/write_controller.h +++ b/external/rocksdb/db/write_controller.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -20,12 +20,13 @@ class WriteControllerToken; // to be called while holding DB mutex class WriteController { public: - explicit WriteController(uint64_t delayed_write_rate = 1024u * 1024u * 32u) + explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u) : total_stopped_(0), total_delayed_(0), + total_compaction_pressure_(0), bytes_left_(0), last_refill_time_(0) { - set_delayed_write_rate(delayed_write_rate); + set_delayed_write_rate(_delayed_write_rate); } ~WriteController() = default; @@ -36,30 +37,40 @@ class WriteController { // writes to the DB will be controlled under the delayed write rate. Every // write needs to call GetDelay() with number of bytes writing to the DB, // which returns number of microseconds to sleep. 
- std::unique_ptr GetDelayToken(); + std::unique_ptr GetDelayToken( + uint64_t delayed_write_rate); + // When an actor (column family) requests a moderate token, compaction + // threads will be increased + std::unique_ptr GetCompactionPressureToken(); - // these two metods are querying the state of the WriteController + // these three metods are querying the state of the WriteController bool IsStopped() const; bool NeedsDelay() const { return total_delayed_ > 0; } + bool NeedSpeedupCompaction() const { + return IsStopped() || NeedsDelay() || total_compaction_pressure_ > 0; + } // return how many microseconds the caller needs to sleep after the call // num_bytes: how many number of bytes to put into the DB. // Prerequisite: DB mutex held. uint64_t GetDelay(Env* env, uint64_t num_bytes); - void set_delayed_write_rate(uint64_t delayed_write_rate) { - delayed_write_rate_ = delayed_write_rate; - if (delayed_write_rate_ == 0) { - // avoid divide 0 - delayed_write_rate_ = 1U; + void set_delayed_write_rate(uint64_t write_rate) { + // avoid divide 0 + if (write_rate == 0) { + write_rate = 1u; } + delayed_write_rate_ = write_rate; } + uint64_t delayed_write_rate() const { return delayed_write_rate_; } private: friend class WriteControllerToken; friend class StopWriteToken; friend class DelayWriteToken; + friend class CompactionPressureToken; int total_stopped_; int total_delayed_; + int total_compaction_pressure_; uint64_t bytes_left_; uint64_t last_refill_time_; uint64_t delayed_write_rate_; @@ -94,4 +105,11 @@ class DelayWriteToken : public WriteControllerToken { virtual ~DelayWriteToken(); }; +class CompactionPressureToken : public WriteControllerToken { + public: + explicit CompactionPressureToken(WriteController* controller) + : WriteControllerToken(controller) {} + virtual ~CompactionPressureToken(); +}; + } // namespace rocksdb diff --git a/external/rocksdb/db/write_controller_test.cc b/external/rocksdb/db/write_controller_test.cc index aa8175d65b..db9a9db1b3 100644 
--- a/external/rocksdb/db/write_controller_test.cc +++ b/external/rocksdb/db/write_controller_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,6 +19,28 @@ class TimeSetEnv : public EnvWrapper { virtual uint64_t NowMicros() override { return now_micros_; } }; +TEST_F(WriteControllerTest, ChangeDelayRateTest) { + TimeSetEnv env; + WriteController controller(10000000u); + auto delay_token_0 = + controller.GetDelayToken(controller.delayed_write_rate()); + ASSERT_EQ(static_cast(2000000), + controller.GetDelay(&env, 20000000u)); + auto delay_token_1 = controller.GetDelayToken(2000000u); + ASSERT_EQ(static_cast(10000000), + controller.GetDelay(&env, 20000000u)); + auto delay_token_2 = controller.GetDelayToken(1000000u); + ASSERT_EQ(static_cast(20000000), + controller.GetDelay(&env, 20000000u)); + auto delay_token_3 = controller.GetDelayToken(20000000u); + ASSERT_EQ(static_cast(1000000), + controller.GetDelay(&env, 20000000u)); + auto delay_token_4 = + controller.GetDelayToken(controller.delayed_write_rate() * 2); + ASSERT_EQ(static_cast(500000), + controller.GetDelay(&env, 20000000u)); +} + TEST_F(WriteControllerTest, SanityTest) { WriteController controller(10000000u); auto stop_token_1 = controller.GetStopToken(); @@ -32,12 +54,19 @@ TEST_F(WriteControllerTest, SanityTest) { TimeSetEnv env; - auto delay_token_1 = controller.GetDelayToken(); + auto delay_token_1 = controller.GetDelayToken(10000000u); + ASSERT_EQ(static_cast(2000000), + controller.GetDelay(&env, 20000000u)); + + env.now_micros_ += 1999900u; // sleep debt 1000 + + auto delay_token_2 = controller.GetDelayToken(10000000u); + // Rate reset after changing the token. 
ASSERT_EQ(static_cast(2000000), controller.GetDelay(&env, 20000000u)); env.now_micros_ += 1999900u; // sleep debt 1000 - auto delay_token_2 = controller.GetDelayToken(); + // One refill: 10240 bytes allowed, 1000 used, 9240 left ASSERT_EQ(static_cast(1124), controller.GetDelay(&env, 1000u)); env.now_micros_ += 1124u; // sleep debt 0 diff --git a/external/rocksdb/db/write_thread.cc b/external/rocksdb/db/write_thread.cc index 9b66af240c..531da55df3 100644 --- a/external/rocksdb/db/write_thread.cc +++ b/external/rocksdb/db/write_thread.cc @@ -1,36 +1,200 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #include "db/write_thread.h" +#include +#include +#include +#include "db/column_family.h" +#include "port/port.h" +#include "util/sync_point.h" namespace rocksdb { -void WriteThread::Await(Writer* w) { - std::unique_lock guard(w->JoinMutex()); - w->JoinCV().wait(guard, [w] { return w->joined; }); +WriteThread::WriteThread(uint64_t max_yield_usec, uint64_t slow_yield_usec) + : max_yield_usec_(max_yield_usec), + slow_yield_usec_(slow_yield_usec), + newest_writer_(nullptr) {} + +uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) { + // We're going to block. Lazily create the mutex. We guarantee + // propagation of this construction to the waker via the + // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex + // or the condvar unless they CAS away the STATE_LOCKED_WAITING that + // we install below. 
+ w->CreateMutex(); + + auto state = w->state.load(std::memory_order_acquire); + assert(state != STATE_LOCKED_WAITING); + if ((state & goal_mask) == 0 && + w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) { + // we have permission (and an obligation) to use StateMutex + std::unique_lock guard(w->StateMutex()); + w->StateCV().wait(guard, [w] { + return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING; + }); + state = w->state.load(std::memory_order_relaxed); + } + // else tricky. Goal is met or CAS failed. In the latter case the waker + // must have changed the state, and compare_exchange_strong has updated + // our local variable with the new one. At the moment WriteThread never + // waits for a transition across intermediate states, so we know that + // since a state change has occurred the goal must have been met. + assert((state & goal_mask) != 0); + return state; } -void WriteThread::MarkJoined(Writer* w) { - std::lock_guard guard(w->JoinMutex()); - assert(!w->joined); - w->joined = true; - w->JoinCV().notify_one(); +uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, + AdaptationContext* ctx) { + uint8_t state; + + // On a modern Xeon each loop takes about 7 nanoseconds (most of which + // is the effect of the pause instruction), so 200 iterations is a bit + // more than a microsecond. This is long enough that waits longer than + // this can amortize the cost of accessing the clock and yielding. + for (uint32_t tries = 0; tries < 200; ++tries) { + state = w->state.load(std::memory_order_acquire); + if ((state & goal_mask) != 0) { + return state; + } + port::AsmVolatilePause(); + } + + // If we're only going to end up waiting a short period of time, + // it can be a lot more efficient to call std::this_thread::yield() + // in a loop than to block in StateMutex(). 
For reference, on my 4.0 + // SELinux test server with support for syscall auditing enabled, the + // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is + // 2.7 usec, and the average is more like 10 usec. That can be a big + // drag on RockDB's single-writer design. Of course, spinning is a + // bad idea if other threads are waiting to run or if we're going to + // wait for a long time. How do we decide? + // + // We break waiting into 3 categories: short-uncontended, + // short-contended, and long. If we had an oracle, then we would always + // spin for short-uncontended, always block for long, and our choice for + // short-contended might depend on whether we were trying to optimize + // RocksDB throughput or avoid being greedy with system resources. + // + // Bucketing into short or long is easy by measuring elapsed time. + // Differentiating short-uncontended from short-contended is a bit + // trickier, but not too bad. We could look for involuntary context + // switches using getrusage(RUSAGE_THREAD, ..), but it's less work + // (portability code and CPU) to just look for yield calls that take + // longer than we expect. sched_yield() doesn't actually result in any + // context switch overhead if there are no other runnable processes + // on the current core, in which case it usually takes less than + // a microsecond. + // + // There are two primary tunables here: the threshold between "short" + // and "long" waits, and the threshold at which we suspect that a yield + // is slow enough to indicate we should probably block. If these + // thresholds are chosen well then CPU-bound workloads that don't + // have more threads than cores will experience few context switches + // (voluntary or involuntary), and the total number of context switches + // (voluntary and involuntary) will not be dramatically larger (maybe + // 2x) than the number of voluntary context switches that occur when + // --max_yield_wait_micros=0. 
+ // + // There's another constant, which is the number of slow yields we will + // tolerate before reversing our previous decision. Solitary slow + // yields are pretty common (low-priority small jobs ready to run), + // so this should be at least 2. We set this conservatively to 3 so + // that we can also immediately schedule a ctx adaptation, rather than + // waiting for the next update_ctx. + + const size_t kMaxSlowYieldsWhileSpinning = 3; + + bool update_ctx = false; + bool would_spin_again = false; + + if (max_yield_usec_ > 0) { + update_ctx = Random::GetTLSInstance()->OneIn(256); + + if (update_ctx || ctx->value.load(std::memory_order_relaxed) >= 0) { + // we're updating the adaptation statistics, or spinning has > + // 50% chance of being shorter than max_yield_usec_ and causing no + // involuntary context switches + auto spin_begin = std::chrono::steady_clock::now(); + + // this variable doesn't include the final yield (if any) that + // causes the goal to be met + size_t slow_yield_count = 0; + + auto iter_begin = spin_begin; + while ((iter_begin - spin_begin) <= + std::chrono::microseconds(max_yield_usec_)) { + std::this_thread::yield(); + + state = w->state.load(std::memory_order_acquire); + if ((state & goal_mask) != 0) { + // success + would_spin_again = true; + break; + } + + auto now = std::chrono::steady_clock::now(); + if (now == iter_begin || + now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) { + // conservatively count it as a slow yield if our clock isn't + // accurate enough to measure the yield duration + ++slow_yield_count; + if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) { + // Not just one ivcsw, but several. 
Immediately update ctx + // and fall back to blocking + update_ctx = true; + break; + } + } + iter_begin = now; + } + } + } + + if ((state & goal_mask) == 0) { + state = BlockingAwaitState(w, goal_mask); + } + + if (update_ctx) { + auto v = ctx->value.load(std::memory_order_relaxed); + // fixed point exponential decay with decay constant 1/1024, with +1 + // and -1 scaled to avoid overflow for int32_t + v = v + (v / 1024) + (would_spin_again ? 1 : -1) * 16384; + ctx->value.store(v, std::memory_order_relaxed); + } + + assert((state & goal_mask) != 0); + return state; } -void WriteThread::LinkOne(Writer* w, bool* wait_needed) { - assert(!w->joined && !w->done); +void WriteThread::SetState(Writer* w, uint8_t new_state) { + auto state = w->state.load(std::memory_order_acquire); + if (state == STATE_LOCKED_WAITING || + !w->state.compare_exchange_strong(state, new_state)) { + assert(state == STATE_LOCKED_WAITING); + + std::lock_guard guard(w->StateMutex()); + assert(w->state.load(std::memory_order_relaxed) != new_state); + w->state.store(new_state, std::memory_order_relaxed); + w->StateCV().notify_one(); + } +} + +void WriteThread::LinkOne(Writer* w, bool* linked_as_leader) { + assert(w->state == STATE_INIT); Writer* writers = newest_writer_.load(std::memory_order_relaxed); while (true) { w->link_older = writers; - if (writers != nullptr) { - w->CreateMutex(); - } if (newest_writer_.compare_exchange_strong(writers, w)) { - // Success. 
- *wait_needed = (writers != nullptr); + if (writers == nullptr) { + // this isn't part of the WriteThread machinery, but helps with + // debugging and is checked by an assert in WriteImpl + w->state.store(STATE_GROUP_LEADER, std::memory_order_relaxed); + } + *linked_as_leader = (writers == nullptr); return; } } @@ -49,22 +213,30 @@ void WriteThread::CreateMissingNewerLinks(Writer* head) { } void WriteThread::JoinBatchGroup(Writer* w) { + static AdaptationContext ctx("JoinBatchGroup"); + assert(w->batch != nullptr); - bool wait_needed; - LinkOne(w, &wait_needed); - if (wait_needed) { - Await(w); + bool linked_as_leader; + LinkOne(w, &linked_as_leader); + + TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w); + + if (!linked_as_leader) { + AwaitState(w, + STATE_GROUP_LEADER | STATE_PARALLEL_FOLLOWER | STATE_COMPLETED, + &ctx); + TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w); } } size_t WriteThread::EnterAsBatchGroupLeader( Writer* leader, WriteThread::Writer** last_writer, - autovector* write_batch_group) { + autovector* write_batch_group) { assert(leader->link_older == nullptr); assert(leader->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(leader->batch); - write_batch_group->push_back(leader->batch); + write_batch_group->push_back(leader); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -76,18 +248,12 @@ size_t WriteThread::EnterAsBatchGroupLeader( *last_writer = leader; - if (leader->has_callback) { - // TODO(agiardullo:) Batching not currently supported as this write may - // fail if the callback function decides to abort this write. - return size; - } - Writer* newest_writer = newest_writer_.load(std::memory_order_acquire); // This is safe regardless of any db mutex status of the caller. 
Previous // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks // (they emptied the list and then we added ourself as leader) or had to - // explicitly wake up us (the list was non-empty when we added ourself, + // explicitly wake us up (the list was non-empty when we added ourself, // so we have already received our MarkJoined). CreateMissingNewerLinks(newest_writer); @@ -108,31 +274,102 @@ size_t WriteThread::EnterAsBatchGroupLeader( break; } - if (w->has_callback) { - // Do not include writes which may be aborted if the callback does not - // succeed. - break; - } - if (w->batch == nullptr) { // Do not include those writes with nullptr batch. Those are not writes, // those are something else. They want to be alone break; } - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { + if (w->callback != nullptr && !w->callback->AllowWriteBatching()) { + // dont batch writes that don't want to be batched + break; + } + + auto batch_size = WriteBatchInternal::ByteSize(w->batch); + if (size + batch_size > max_size) { // Do not make batch too big break; } - write_batch_group->push_back(w->batch); + size += batch_size; + write_batch_group->push_back(w); w->in_batch_group = true; *last_writer = w; } return size; } +void WriteThread::LaunchParallelFollowers(ParallelGroup* pg, + SequenceNumber sequence) { + // EnterAsBatchGroupLeader already created the links from leader to + // newer writers in the group + + pg->leader->parallel_group = pg; + + Writer* w = pg->leader; + w->sequence = sequence; + + while (w != pg->last_writer) { + // Writers that won't write don't get sequence allotment + if (!w->CallbackFailed()) { + sequence += WriteBatchInternal::Count(w->batch); + } + w = w->link_newer; + + w->sequence = sequence; + w->parallel_group = pg; + SetState(w, STATE_PARALLEL_FOLLOWER); + } +} + +bool WriteThread::CompleteParallelWorker(Writer* w) { + static AdaptationContext ctx("CompleteParallelWorker"); + + auto* pg = w->parallel_group; + 
if (!w->status.ok()) { + std::lock_guard guard(w->StateMutex()); + pg->status = w->status; + } + + auto leader = pg->leader; + auto early_exit_allowed = pg->early_exit_allowed; + + if (pg->running.load(std::memory_order_acquire) > 1 && pg->running-- > 1) { + // we're not the last one + AwaitState(w, STATE_COMPLETED, &ctx); + + // Caller only needs to perform exit duties if early exit doesn't + // apply and this is the leader. Can't touch pg here. Whoever set + // our state to STATE_COMPLETED copied pg->status to w.status for us. + return w == leader && !(early_exit_allowed && w->status.ok()); + } + // else we're the last parallel worker + + if (w == leader || (early_exit_allowed && pg->status.ok())) { + // this thread should perform exit duties + w->status = pg->status; + return true; + } else { + // We're the last parallel follower but early commit is not + // applicable. Wake up the leader and then wait for it to exit. + assert(w->state == STATE_PARALLEL_FOLLOWER); + SetState(leader, STATE_COMPLETED); + AwaitState(w, STATE_COMPLETED, &ctx); + return false; + } +} + +void WriteThread::EarlyExitParallelGroup(Writer* w) { + auto* pg = w->parallel_group; + + assert(w->state == STATE_PARALLEL_FOLLOWER); + assert(pg->status.ok()); + ExitAsBatchGroupLeader(pg->leader, pg->last_writer, pg->status); + assert(w->status.ok()); + assert(w->state == STATE_COMPLETED); + SetState(pg->leader, STATE_COMPLETED); +} + void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer, Status status) { assert(leader->link_older == nullptr); @@ -164,30 +401,33 @@ void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer, // nullptr when they enqueued (we were definitely enqueued before them // and are still in the list). 
That means leader handoff occurs when // we call MarkJoined - MarkJoined(last_writer->link_newer); + SetState(last_writer->link_newer, STATE_GROUP_LEADER); } // else nobody else was waiting, although there might already be a new // leader now while (last_writer != leader) { last_writer->status = status; - last_writer->done = true; - // We must read link_older before calling MarkJoined, because as - // soon as it is marked the other thread's AwaitJoined may return - // and deallocate the Writer. + // we need to read link_older before calling SetState, because as soon + // as it is marked committed the other thread's Await may return and + // deallocate the Writer. auto next = last_writer->link_older; - MarkJoined(last_writer); + SetState(last_writer, STATE_COMPLETED); + last_writer = next; } } void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) { + static AdaptationContext ctx("EnterUnbatched"); + assert(w->batch == nullptr); - bool wait_needed; - LinkOne(w, &wait_needed); - if (wait_needed) { + bool linked_as_leader; + LinkOne(w, &linked_as_leader); + if (!linked_as_leader) { mu->Unlock(); - Await(w); + TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait"); + AwaitState(w, STATE_GROUP_LEADER, &ctx); mu->Lock(); } } diff --git a/external/rocksdb/db/write_thread.h b/external/rocksdb/db/write_thread.h index 3a15ea847b..87ffff6f95 100644 --- a/external/rocksdb/db/write_thread.h +++ b/external/rocksdb/db/write_thread.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -8,11 +8,16 @@ #include #include #include +#include #include #include #include +#include + +#include "db/write_callback.h" #include "rocksdb/status.h" -#include "db/write_batch_internal.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" #include "util/autovector.h" #include "util/instrumented_mutex.h" @@ -20,19 +25,74 @@ namespace rocksdb { class WriteThread { public: + enum State : uint8_t { + // The initial state of a writer. This is a Writer that is + // waiting in JoinBatchGroup. This state can be left when another + // thread informs the waiter that it has become a group leader + // (-> STATE_GROUP_LEADER), when a leader that has chosen to be + // non-parallel informs a follower that its writes have been committed + // (-> STATE_COMPLETED), or when a leader that has chosen to perform + // updates in parallel and needs this Writer to apply its batch (-> + // STATE_PARALLEL_FOLLOWER). + STATE_INIT = 1, + + // The state used to inform a waiting Writer that it has become the + // leader, and it should now build a write batch group. Tricky: + // this state is not used if newest_writer_ is empty when a writer + // enqueues itself, because there is no need to wait (or even to + // create the mutex and condvar used to wait) in that case. This is + // a terminal state unless the leader chooses to make this a parallel + // batch, in which case the last parallel worker to finish will move + // the leader to STATE_COMPLETED. + STATE_GROUP_LEADER = 2, + + // A Writer that has returned as a follower in a parallel group. + // It should apply its batch to the memtable and then call + // CompleteParallelWorker. When someone calls ExitAsBatchGroupLeader + // or EarlyExitParallelGroup this state will get transitioned to + // STATE_COMPLETED. + STATE_PARALLEL_FOLLOWER = 4, + + // A follower whose writes have been applied, or a parallel leader + // whose followers have all finished their work. This is a terminal + // state. 
+ STATE_COMPLETED = 8, + + // A state indicating that the thread may be waiting using StateMutex() + // and StateCondVar() + STATE_LOCKED_WAITING = 16, + }; + + struct Writer; + + struct ParallelGroup { + Writer* leader; + Writer* last_writer; + SequenceNumber last_sequence; + bool early_exit_allowed; + // before running goes to zero, status needs leader->StateMutex() + Status status; + std::atomic running; + }; + // Information kept for every waiting writer. struct Writer { WriteBatch* batch; bool sync; bool disableWAL; + bool disable_memtable; + uint64_t log_used; // log number that this batch was inserted into + uint64_t log_ref; // log number that memtable insert should reference bool in_batch_group; - bool done; - bool has_callback; - Status status; - bool made_waitable; // records lazy construction of mutex and cv - bool joined; // read/write only under JoinMutex() (or pre-link) - std::aligned_storage::type join_mutex_bytes; - std::aligned_storage::type join_cv_bytes; + WriteCallback* callback; + bool made_waitable; // records lazy construction of mutex and cv + std::atomic state; // write under StateMutex() or pre-link + ParallelGroup* parallel_group; + SequenceNumber sequence; // the sequence number to use + Status status; // status of memtable inserter + Status callback_status; // status returned by callback->Callback() + std::aligned_storage::type state_mutex_bytes; + std::aligned_storage::type state_cv_bytes; Writer* link_older; // read/write only before linking, or as leader Writer* link_newer; // lazy, read/write only before linking, or as leader @@ -40,45 +100,87 @@ class WriteThread { : batch(nullptr), sync(false), disableWAL(false), + disable_memtable(false), + log_used(0), + log_ref(0), in_batch_group(false), - done(false), - has_callback(false), + callback(nullptr), made_waitable(false), - joined(false), + state(STATE_INIT), + parallel_group(nullptr), link_older(nullptr), link_newer(nullptr) {} ~Writer() { if (made_waitable) { - 
JoinMutex().~mutex(); - JoinCV().~condition_variable(); + StateMutex().~mutex(); + StateCV().~condition_variable(); + } + } + + bool CheckCallback(DB* db) { + if (callback != nullptr) { + callback_status = callback->Callback(db); } + return callback_status.ok(); } void CreateMutex() { - assert(!joined); if (!made_waitable) { + // Note that made_waitable is tracked separately from state + // transitions, because we can't atomically create the mutex and + // link into the list. made_waitable = true; - new (&join_mutex_bytes) std::mutex; - new (&join_cv_bytes) std::condition_variable; + new (&state_mutex_bytes) std::mutex; + new (&state_cv_bytes) std::condition_variable; + } + } + + // returns the aggregate status of this Writer + Status FinalStatus() { + if (!status.ok()) { + // a non-ok memtable write status takes presidence + assert(callback == nullptr || callback_status.ok()); + return status; + } else if (!callback_status.ok()) { + // if the callback failed then that is the status we want + // because a memtable insert should not have been attempted + assert(callback != nullptr); + assert(status.ok()); + return callback_status; + } else { + // if there is no callback then we only care about + // the memtable insert status + assert(callback == nullptr || callback_status.ok()); + return status; } } - // No other mutexes may be acquired while holding JoinMutex(), it is + bool CallbackFailed() { + return (callback != nullptr) && !callback_status.ok(); + } + + bool ShouldWriteToMemtable() { + return !CallbackFailed() && !disable_memtable; + } + + bool ShouldWriteToWAL() { return !CallbackFailed() && !disableWAL; } + + // No other mutexes may be acquired while holding StateMutex(), it is // always last in the order - std::mutex& JoinMutex() { + std::mutex& StateMutex() { assert(made_waitable); - return *static_cast(static_cast(&join_mutex_bytes)); + return *static_cast(static_cast(&state_mutex_bytes)); } - std::condition_variable& JoinCV() { + std::condition_variable& 
StateCV() { assert(made_waitable); return *static_cast( - static_cast(&join_cv_bytes)); + static_cast(&state_cv_bytes)); } }; - WriteThread() : newest_writer_(nullptr) {} + WriteThread(uint64_t max_yield_usec, uint64_t slow_yield_usec); // IMPORTANT: None of the methods in this class rely on the db mutex // for correctness. All of the methods except JoinBatchGroup and @@ -86,13 +188,16 @@ class WriteThread { // Correctness is maintained by ensuring that only a single thread is // a leader at a time. - // Registers w as ready to become part of a batch group, and blocks - // until some other thread has completed the write (in which case - // w->done will be set to true) or this write has become the leader - // of a batch group (w->done will remain unset). The db mutex SHOULD - // NOT be held when calling this function, because it will block. - // If !w->done then JoinBatchGroup should be followed by a call to - // EnterAsBatchGroupLeader and ExitAsBatchGroupLeader. + // Registers w as ready to become part of a batch group, waits until the + // caller should perform some work, and returns the current state of the + // writer. If w has become the leader of a write batch group, returns + // STATE_GROUP_LEADER. If w has been made part of a sequential batch + // group and the leader has performed the write, returns STATE_DONE. + // If w has been made part of a parallel batch group and is responsible + // for updating the memtable, returns STATE_PARALLEL_FOLLOWER. + // + // The db mutex SHOULD NOT be held when calling this function, because + // it will block. // // Writer* w: Writer to be executed as part of a batch group void JoinBatchGroup(Writer* w); @@ -100,15 +205,36 @@ class WriteThread { // Constructs a write batch group led by leader, which should be a // Writer passed to JoinBatchGroup on the current thread. 
// - // Writer* leader: Writer passed to JoinBatchGroup, but !done - // Writer** last_writer: Out-param for use by ExitAsBatchGroupLeader + // Writer* leader: Writer that is STATE_GROUP_LEADER + // Writer** last_writer: Out-param that identifies the last follower // autovector* write_batch_group: Out-param of group members - // returns: Total batch group size - size_t EnterAsBatchGroupLeader(Writer* leader, Writer** last_writer, - autovector* write_batch_group); + // returns: Total batch group byte size + size_t EnterAsBatchGroupLeader( + Writer* leader, Writer** last_writer, + autovector* write_batch_group); - // Unlinks the Writer-s in a batch group, wakes up the non-leaders, and - // wakes up the next leader (if any). + // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the + // non-leader members of this write batch group. Sets Writer::sequence + // before waking them up. + // + // ParallalGroup* pg: Extra state used to coordinate the parallel add + // SequenceNumber sequence: Starting sequence number to assign to Writer-s + void LaunchParallelFollowers(ParallelGroup* pg, SequenceNumber sequence); + + // Reports the completion of w's batch to the parallel group leader, and + // waits for the rest of the parallel batch to complete. Returns true + // if this thread is the last to complete, and hence should advance + // the sequence number and then call EarlyExitParallelGroup, false if + // someone else has already taken responsibility for that. + bool CompleteParallelWorker(Writer* w); + + // This method performs an early completion of a parallel write group, + // where the cleanup work of the leader is performed by a follower who + // happens to be the last parallel worker to complete. + void EarlyExitParallelGroup(Writer* w); + + // Unlinks the Writer-s in a batch group, wakes up the non-leaders, + // and wakes up the next leader (if any). 
// // Writer* leader: From EnterAsBatchGroupLeader // Writer* last_writer: Value of out-param of EnterAsBatchGroupLeader @@ -128,18 +254,37 @@ class WriteThread { // writers. void ExitUnbatched(Writer* w); + struct AdaptationContext { + const char* name; + std::atomic value; + + explicit AdaptationContext(const char* name0) : name(name0), value(0) {} + }; + private: + uint64_t max_yield_usec_; + uint64_t slow_yield_usec_; + // Points to the newest pending Writer. Only leader can remove // elements, adding can be done lock-free by anybody std::atomic newest_writer_; - void Await(Writer* w); - void MarkJoined(Writer* w); + // Waits for w->state & goal_mask using w->StateMutex(). Returns + // the state that satisfies goal_mask. + uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask); + + // Blocks until w->state & goal_mask, returning the state value + // that satisfied the predicate. Uses ctx to adaptively use + // std::this_thread::yield() to avoid mutex overheads. ctx should be + // a context-dependent static. + uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx); + + void SetState(Writer* w, uint8_t new_state); - // Links w into the newest_writer_ list. Sets *wait_needed to false - // if w was linked directly into the leader position, true otherwise. - // Safe to call from multiple threads without external locking. - void LinkOne(Writer* w, bool* wait_needed); + // Links w into the newest_writer_ list. Sets *linked_as_leader to + // true if w was linked directly into the leader position. Safe to + // call from multiple threads without external locking. + void LinkOne(Writer* w, bool* linked_as_leader); // Computes any missing link_newer links. Should not be called // concurrently with itself. diff --git a/external/rocksdb/db/writebuffer.h b/external/rocksdb/db/writebuffer.h deleted file mode 100644 index 7047a92440..0000000000 --- a/external/rocksdb/db/writebuffer.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// WriteBuffer is for managing memory allocation for one or more MemTables. - -#pragma once - -namespace rocksdb { - -class WriteBuffer { - public: - explicit WriteBuffer(size_t _buffer_size) - : buffer_size_(_buffer_size), memory_used_(0) {} - - ~WriteBuffer() {} - - size_t memory_usage() const { return memory_used_; } - size_t buffer_size() const { return buffer_size_; } - - // Should only be called from write thread - bool ShouldFlush() const { - return buffer_size() > 0 && memory_usage() >= buffer_size(); - } - - // Should only be called from write thread - void ReserveMem(size_t mem) { memory_used_ += mem; } - void FreeMem(size_t mem) { memory_used_ -= mem; } - - private: - const size_t buffer_size_; - size_t memory_used_; - - // No copying allowed - WriteBuffer(const WriteBuffer&); - void operator=(const WriteBuffer&); -}; - -} // namespace rocksdb diff --git a/external/rocksdb/db/xfunc_test_points.cc b/external/rocksdb/db/xfunc_test_points.cc new file mode 100644 index 0000000000..67e96dd059 --- /dev/null +++ b/external/rocksdb/db/xfunc_test_points.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/xfunc_test_points.h" +#include "util/xfunc.h" + +namespace rocksdb { + +#ifdef XFUNC + +void xf_manage_release(ManagedIterator* iter) { + if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) { + iter->ReleaseIter(false); + } +} + +void xf_manage_create(ManagedIterator* iter) { iter->SetDropOld(false); } + +void xf_manage_new(DBImpl* db, ReadOptions* read_options, + bool is_snapshot_supported) { + if ((!XFuncPoint::Check("managed_xftest_dropold") && + (!XFuncPoint::Check("managed_xftest_release"))) || + (!read_options->managed)) { + return; + } + if ((!read_options->tailing) && (read_options->snapshot == nullptr) && + (!is_snapshot_supported)) { + read_options->managed = false; + return; + } + if (db->GetOptions().prefix_extractor != nullptr) { + if (strcmp(db->GetOptions().table_factory.get()->Name(), "PlainTable")) { + if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) { + read_options->total_order_seek = true; + } + } else { + read_options->managed = false; + } + } +} + +class XFTransactionWriteHandler : public WriteBatch::Handler { + public: + Transaction* txn_; + DBImpl* db_impl_; + + XFTransactionWriteHandler(Transaction* txn, DBImpl* db_impl) + : txn_(txn), db_impl_(db_impl) {} + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + InstrumentedMutexLock l(&db_impl_->mutex_); + + ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); + if (cfh == nullptr) { + return Status::InvalidArgument( + "XFUNC test could not find column family " + "handle for id ", + ToString(column_family_id)); + } + + txn_->Put(cfh, key, value); + + return Status::OK(); + } + + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + InstrumentedMutexLock l(&db_impl_->mutex_); + + ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); + if (cfh == nullptr) { + return Status::InvalidArgument( + "XFUNC test could not find column family " + 
"handle for id ", + ToString(column_family_id)); + } + + txn_->Merge(cfh, key, value); + + return Status::OK(); + } + + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + InstrumentedMutexLock l(&db_impl_->mutex_); + + ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); + if (cfh == nullptr) { + return Status::InvalidArgument( + "XFUNC test could not find column family " + "handle for id ", + ToString(column_family_id)); + } + + txn_->Delete(cfh, key); + + return Status::OK(); + } + + virtual void LogData(const Slice& blob) override { txn_->PutLogData(blob); } +}; + +// Whenever DBImpl::Write is called, create a transaction and do the write via +// the transaction. +void xf_transaction_write(const WriteOptions& write_options, + const DBOptions& db_options, WriteBatch* my_batch, + WriteCallback* callback, DBImpl* db_impl, Status* s, + bool* write_attempted) { + if (callback != nullptr) { + // We may already be in a transaction, don't force a transaction + *write_attempted = false; + return; + } + + OptimisticTransactionDB* txn_db = new OptimisticTransactionDB(db_impl); + Transaction* txn = Transaction::BeginTransaction(txn_db, write_options); + + XFTransactionWriteHandler handler(txn, db_impl); + *s = my_batch->Iterate(&handler); + + if (!s->ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log, + "XFUNC test could not iterate batch. status: $s\n", + s->ToString().c_str()); + } + + *s = txn->Commit(); + + if (!s->ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log, + "XFUNC test could not commit transaction. 
status: $s\n", + s->ToString().c_str()); + } + + *write_attempted = true; + delete txn; + delete txn_db; +} + +#endif // XFUNC + +} // namespace rocksdb diff --git a/external/rocksdb/db/xfunc_test_points.h b/external/rocksdb/db/xfunc_test_points.h new file mode 100644 index 0000000000..8ed9f2c736 --- /dev/null +++ b/external/rocksdb/db/xfunc_test_points.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "db/db_impl.h" +#include "db/managed_iterator.h" +#include "db/write_callback.h" +#include "rocksdb/options.h" +#include "rocksdb/write_batch.h" +#include "util/xfunc.h" + +namespace rocksdb { + +#ifdef XFUNC + +// DB-specific test points for the cross-functional test framework (see +// util/xfunc.h). +void xf_manage_release(ManagedIterator* iter); +void xf_manage_create(ManagedIterator* iter); +void xf_manage_new(DBImpl* db, ReadOptions* readoptions, + bool is_snapshot_supported); +void xf_transaction_write(const WriteOptions& write_options, + const DBOptions& db_options, + class WriteBatch* my_batch, + class WriteCallback* callback, DBImpl* db_impl, + Status* success, bool* write_attempted); + +#endif // XFUNC + +} // namespace rocksdb diff --git a/external/rocksdb/doc/log_format.txt b/external/rocksdb/doc/log_format.txt index 3a0414b65a..5222d81131 100644 --- a/external/rocksdb/doc/log_format.txt +++ b/external/rocksdb/doc/log_format.txt @@ -11,7 +11,7 @@ Each block consists of a sequence of records: A record never starts within the last six bytes of a block (since it won't fit). Any leftover bytes here form the trailer, which must -consist entirely of zero bytes and must be skipped by readers. +consist entirely of zero bytes and must be skipped by readers. 
Aside: if exactly seven bytes are left in the current block, and a new non-zero length record is added, the writer must emit a FIRST record diff --git a/external/rocksdb/examples/Makefile b/external/rocksdb/examples/Makefile index fe82d11cd2..b04f9a8fba 100644 --- a/external/rocksdb/examples/Makefile +++ b/external/rocksdb/examples/Makefile @@ -1,5 +1,13 @@ include ../make_config.mk +ifndef DISABLE_JEMALLOC + ifdef JEMALLOC + PLATFORM_CXXFLAGS += "-DROCKSDB_JEMALLOC" + endif + EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) -lpthread + PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) +endif + .PHONY: clean librocksdb all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example @@ -28,8 +36,11 @@ optimistic_transaction_example: librocksdb optimistic_transaction_example.cc transaction_example: librocksdb transaction_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +options_file_example: librocksdb options_file_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + clean: rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example librocksdb: - cd .. && $(MAKE) librocksdb.a + cd .. && $(MAKE) static_lib diff --git a/external/rocksdb/examples/README.md b/external/rocksdb/examples/README.md index b07b3903a6..f4ba2384b8 100644 --- a/external/rocksdb/examples/README.md +++ b/external/rocksdb/examples/README.md @@ -1 +1,2 @@ -Compile RocksDB first by executing `make static_lib` in parent dir +1. Compile RocksDB first by executing `make static_lib` in parent dir +2. 
Compile all examples: `cd examples/; make all` diff --git a/external/rocksdb/examples/column_families_example.cc b/external/rocksdb/examples/column_families_example.cc index 3ffac064d8..f2dec691ea 100644 --- a/external/rocksdb/examples/column_families_example.cc +++ b/external/rocksdb/examples/column_families_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/examples/compact_files_example.cc b/external/rocksdb/examples/compact_files_example.cc index 6c0456675c..023ae403b7 100644 --- a/external/rocksdb/examples/compact_files_example.cc +++ b/external/rocksdb/examples/compact_files_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/examples/compaction_filter_example.cc b/external/rocksdb/examples/compaction_filter_example.cc index 050f4611a9..77dbd9af76 100644 --- a/external/rocksdb/examples/compaction_filter_example.cc +++ b/external/rocksdb/examples/compaction_filter_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -10,19 +10,18 @@ class MyMerge : public rocksdb::MergeOperator { public: - bool FullMerge(const rocksdb::Slice& key, - const rocksdb::Slice* existing_value, - const std::deque& operand_list, - std::string* new_value, - rocksdb::Logger* logger) const override { - new_value->clear(); - if (existing_value != nullptr) { - new_value->assign(existing_value->data(), existing_value->size()); + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + merge_out->new_value.clear(); + if (merge_in.existing_value != nullptr) { + merge_out->new_value.assign(merge_in.existing_value->data(), + merge_in.existing_value->size()); } - for (const std::string& m : operand_list) { - fprintf(stderr, "Merge(%s)\n", m.c_str()); - assert(m != "bad"); // the compaction filter filters out bad values - new_value->assign(m); + for (const rocksdb::Slice& m : merge_in.operand_list) { + fprintf(stderr, "Merge(%s)\n", m.ToString().c_str()); + // the compaction filter filters out bad values + assert(m.ToString() != "bad"); + merge_out->new_value.assign(m.data(), m.size()); } return true; } diff --git a/external/rocksdb/examples/optimistic_transaction_example.cc b/external/rocksdb/examples/optimistic_transaction_example.cc index e9ab0e5ee2..d28a305b34 100644 --- a/external/rocksdb/examples/optimistic_transaction_example.cc +++ b/external/rocksdb/examples/optimistic_transaction_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/examples/options_file_example.cc b/external/rocksdb/examples/options_file_example.cc new file mode 100644 index 0000000000..360ccddf23 --- /dev/null +++ b/external/rocksdb/examples/options_file_example.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file demonstrates how to use the utility functions defined in +// rocksdb/utilities/options_util.h to open a rocksdb database without +// remembering all the rocksdb options. +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/options_util.h" + +using namespace rocksdb; + +std::string kDBPath = "/tmp/rocksdb_options_file_example"; + +namespace { +// A dummy compaction filter +class DummyCompactionFilter : public CompactionFilter { + public: + virtual ~DummyCompactionFilter() {} + virtual bool Filter(int level, const Slice& key, const Slice& existing_value, + std::string* new_value, bool* value_changed) const { + return false; + } + virtual const char* Name() const { return "DummyCompactionFilter"; } +}; + +} // namespace + +int main() { + DBOptions db_opt; + db_opt.create_if_missing = true; + + std::vector cf_descs; + cf_descs.push_back({kDefaultColumnFamilyName, ColumnFamilyOptions()}); + cf_descs.push_back({"new_cf", ColumnFamilyOptions()}); + + // initialize BlockBasedTableOptions + auto cache = NewLRUCache(1 * 1024 * 1024 * 1024); + BlockBasedTableOptions bbt_opts; + bbt_opts.block_size = 32 * 1024; + bbt_opts.block_cache = cache; + + // initialize column families options + std::unique_ptr compaction_filter; + 
compaction_filter.reset(new DummyCompactionFilter()); + cf_descs[0].options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts)); + cf_descs[0].options.compaction_filter = compaction_filter.get(); + cf_descs[1].options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts)); + + // destroy and open DB + DB* db; + Status s = DestroyDB(kDBPath, Options(db_opt, cf_descs[0].options)); + assert(s.ok()); + s = DB::Open(Options(db_opt, cf_descs[0].options), kDBPath, &db); + assert(s.ok()); + + // Create column family, and rocksdb will persist the options. + ColumnFamilyHandle* cf; + s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf", &cf); + assert(s.ok()); + + // close DB + delete cf; + delete db; + + // In the following code, we will reopen the rocksdb instance using + // the options file stored in the db directory. + + // Load the options file. + DBOptions loaded_db_opt; + std::vector loaded_cf_descs; + s = LoadLatestOptions(kDBPath, Env::Default(), &loaded_db_opt, + &loaded_cf_descs); + assert(s.ok()); + assert(loaded_db_opt.create_if_missing == db_opt.create_if_missing); + + // Initialize pointer options for each column family + for (size_t i = 0; i < loaded_cf_descs.size(); ++i) { + auto* loaded_bbt_opt = reinterpret_cast( + loaded_cf_descs[0].options.table_factory->GetOptions()); + // Expect the same as BlockBasedTableOptions will be loaded form file. + assert(loaded_bbt_opt->block_size == bbt_opts.block_size); + // However, block_cache needs to be manually initialized as documented + // in rocksdb/utilities/options_util.h. + loaded_bbt_opt->block_cache = cache; + } + // In addition, as pointer options are initialized with default value, + // we need to properly initialized all the pointer options if non-defalut + // values are used before calling DB::Open(). + assert(loaded_cf_descs[0].options.compaction_filter == nullptr); + loaded_cf_descs[0].options.compaction_filter = compaction_filter.get(); + + // reopen the db using the loaded options. 
+ std::vector handles; + s = DB::Open(loaded_db_opt, kDBPath, loaded_cf_descs, &handles, &db); + assert(s.ok()); + + // close DB + for (auto* handle : handles) { + delete handle; + } + delete db; +} diff --git a/external/rocksdb/examples/rocksdb_option_file_example.ini b/external/rocksdb/examples/rocksdb_option_file_example.ini index ce74f77fd5..7dc0704298 100644 --- a/external/rocksdb/examples/rocksdb_option_file_example.ini +++ b/external/rocksdb/examples/rocksdb_option_file_example.ini @@ -1,9 +1,10 @@ # This is a RocksDB option file. # -# A typical RocksDB options file has three sections, which are -# Version, DBOptions, and more than one CFOptions. The RocksDB -# options file in general follows the basic INI file format -# with the following extensions / modifications: +# A typical RocksDB options file has four sections, which are +# Version section, DBOptions section, at least one CFOptions +# section, and one TableOptions section for each column family. +# The RocksDB options file in general follows the basic INI +# file format with the following extensions / modifications: # # * Escaped characters # We escaped the following characters: @@ -20,7 +21,7 @@ # Each statement contains a '=', where extra white-spaces # are supported. However, we don't support multi-lined statement. # Furthermore, each line can only contain at most one statement. -# * Section +# * Sections # Sections are of the form [SecitonTitle "SectionArgument"], # where section argument is optional. # * List @@ -29,25 +30,115 @@ # # Below is an example of a RocksDB options file: [Version] - # The Version section stores the version information about rocksdb - # and option file. This is used for handling potential format - # change in the future. - rocksdb_version=4.0.0 # We support "#" style comment. - options_file_version=1.0 + rocksdb_version=4.3.0 + options_file_version=1.1 + [DBOptions] - # Followed by the Version section is the DBOptions section. 
- # The value of an options can be assigned using a statement. - # Note that for those options that is not set in the options file, - # we will use the default value. - max_open_files=12345 - max_background_flushes=301 + stats_dump_period_sec=600 + max_manifest_file_size=18446744073709551615 + bytes_per_sync=8388608 + delayed_write_rate=2097152 + WAL_ttl_seconds=0 + WAL_size_limit_MB=0 + max_subcompactions=1 + wal_dir= + wal_bytes_per_sync=0 + db_write_buffer_size=0 + keep_log_file_num=1000 + table_cache_numshardbits=4 + max_file_opening_threads=1 + writable_file_max_buffer_size=1048576 + random_access_max_buffer_size=1048576 + use_fsync=false + max_total_wal_size=0 + max_open_files=-1 + skip_stats_update_on_db_open=false + max_background_compactions=16 + manifest_preallocation_size=4194304 + max_background_flushes=7 + is_fd_close_on_exec=true + max_log_file_size=0 + advise_random_on_open=true + create_missing_column_families=false + paranoid_checks=true + delete_obsolete_files_period_micros=21600000000 + disable_data_sync=false + log_file_time_to_roll=0 + compaction_readahead_size=0 + create_if_missing=false + use_adaptive_mutex=false + enable_thread_tracking=false + disableDataSync=false + allow_fallocate=true + error_if_exists=false + recycle_log_file_num=0 + skip_log_error_on_recovery=false + allow_mmap_reads=false + allow_os_buffer=true + db_log_dir= + new_table_reader_for_compaction_inputs=true + allow_mmap_writes=false + + [CFOptions "default"] - # ColumnFamilyOptions section must follow the format of - # [CFOptions "cf name"]. If a rocksdb instance - # has multiple column families, then its CFOptions must be - # specified in the same order as column family creation order. -[CFOptions "the second column family"] - # Each column family must have one section in the RocksDB option - # file even all the options of this column family are set to - # default value. 
-[CFOptions "the third column family"] + compaction_style=kCompactionStyleLevel + compaction_filter=nullptr + num_levels=6 + table_factory=BlockBasedTable + comparator=leveldb.BytewiseComparator + max_sequential_skip_in_iterations=8 + soft_rate_limit=0.000000 + max_bytes_for_level_base=1073741824 + memtable_prefix_bloom_probes=6 + memtable_prefix_bloom_bits=0 + memtable_prefix_bloom_huge_page_tlb_size=0 + max_successive_merges=0 + arena_block_size=16777216 + min_write_buffer_number_to_merge=1 + target_file_size_multiplier=1 + source_compaction_factor=1 + max_bytes_for_level_multiplier=8 + compaction_filter_factory=nullptr + max_write_buffer_number=8 + level0_stop_writes_trigger=20 + compression=kSnappyCompression + level0_file_num_compaction_trigger=4 + purge_redundant_kvs_while_flush=true + max_write_buffer_number_to_maintain=0 + memtable_factory=SkipListFactory + max_grandparent_overlap_factor=8 + expanded_compaction_factor=25 + hard_pending_compaction_bytes_limit=137438953472 + inplace_update_num_locks=10000 + level_compaction_dynamic_level_bytes=true + level0_slowdown_writes_trigger=12 + filter_deletes=false + verify_checksums_in_compaction=true + min_partial_merge_operands=2 + paranoid_file_checks=false + target_file_size_base=134217728 + optimize_filters_for_hits=false + merge_operator=PutOperator + compression_per_level=kNoCompression:kNoCompression:kNoCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression + compaction_measure_io_stats=false + prefix_extractor=nullptr + bloom_locality=0 + write_buffer_size=134217728 + disable_auto_compactions=false + inplace_update_support=false + +[TableOptions/BlockBasedTable "default"] + format_version=2 + whole_key_filtering=true + skip_table_builder_flush=false + no_block_cache=false + checksum=kCRC32c + filter_policy=rocksdb.BuiltinBloomFilter + block_size_deviation=10 + block_size=8192 + block_restart_interval=16 + cache_index_and_filter_blocks=false + pin_l0_filter_and_index_blocks_in_cache=false + 
index_type=kBinarySearch + hash_index_allow_collision=true + flush_block_policy_factory=FlushBlockBySizePolicyFactory diff --git a/external/rocksdb/examples/simple_example.cc b/external/rocksdb/examples/simple_example.cc index 28a7c9e8b1..453443479b 100644 --- a/external/rocksdb/examples/simple_example.cc +++ b/external/rocksdb/examples/simple_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/examples/transaction_example.cc b/external/rocksdb/examples/transaction_example.cc index a7d5061293..914f1bc304 100644 --- a/external/rocksdb/examples/transaction_example.cc +++ b/external/rocksdb/examples/transaction_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/hdfs/README b/external/rocksdb/hdfs/README index f4f1106e45..9036511692 100644 --- a/external/rocksdb/hdfs/README +++ b/external/rocksdb/hdfs/README @@ -10,10 +10,10 @@ The env_hdfs.h file defines the rocksdb objects that are needed to talk to an underlying filesystem. 
If you want to compile rocksdb with hdfs support, please set the following -enviroment variables appropriately (also defined in setup.sh for convenience) +environment variables appropriately (also defined in setup.sh for convenience) USE_HDFS=1 - JAVA_HOME=/usr/local/jdk-6u22-64 - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs + JAVA_HOME=/usr/local/jdk-7u79-64 + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-7u79-64/jre/lib/amd64/server:/usr/local/jdk-7u79-64/jre/lib/amd64/:./snappy/libs make clean all db_bench To run dbbench, diff --git a/external/rocksdb/hdfs/env_hdfs.h b/external/rocksdb/hdfs/env_hdfs.h index e1e9430934..ab27e0440e 100644 --- a/external/rocksdb/hdfs/env_hdfs.h +++ b/external/rocksdb/hdfs/env_hdfs.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -89,7 +89,9 @@ class HdfsEnv : public Env { virtual Status RenameFile(const std::string& src, const std::string& target); - virtual Status LinkFile(const std::string& src, const std::string& target); + virtual Status LinkFile(const std::string& src, const std::string& target) { + return Status::NotSupported(); // not supported + } virtual Status LockFile(const std::string& fname, FileLock** lock); @@ -99,8 +101,8 @@ class HdfsEnv : public Env { std::shared_ptr* result); virtual void Schedule(void (*function)(void* arg), void* arg, - Priority pri = LOW, void* tag = nullptr) { - posixEnv->Schedule(function, arg, pri, tag); + Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = 0) { + posixEnv->Schedule(function, arg, pri, tag, unschedFunction); } virtual int UnSchedule(void* tag, Priority pri) { @@ -322,7 +324,8 @@ class HdfsEnv : public Env { } virtual void Schedule(void (*function)(void* arg), void* arg, - Priority pri = LOW, void* tag = nullptr) override {} + Priority pri = LOW, void* tag = nullptr, + void (*unschedFunction)(void* arg) = 0) override {} virtual int UnSchedule(void* tag, Priority pri) override { return 0; } diff --git a/external/rocksdb/include/rocksdb/c.h b/external/rocksdb/include/rocksdb/c.h index 782d10b483..e269aa9b48 100644 --- a/external/rocksdb/include/rocksdb/c.h +++ b/external/rocksdb/include/rocksdb/c.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2013, Facebook, Inc. All rights reserved. +/* Copyright (c) 2011-present, Facebook, Inc. All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. An additional grant of patent rights can be found in the PATENTS file in the same directory. 
@@ -121,6 +121,9 @@ extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( + rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* rocksdb_restore_options_create(); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( @@ -266,6 +269,11 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf( rocksdb_t* db, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family); +extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators( + rocksdb_t *db, rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, size_t size, char** errptr); + extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot( rocksdb_t* db); @@ -450,6 +458,12 @@ rocksdb_block_based_options_set_hash_index_allow_collision( extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_cache_index_and_filter_blocks( rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_skip_table_builder_flush( + rocksdb_block_based_table_options_t* options, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); @@ -490,6 +504,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( rocksdb_options_t*, rocksdb_compactionfilter_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( rocksdb_options_t*, 
rocksdb_compactionfilterfactory_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size( + rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator( rocksdb_options_t*, rocksdb_comparator_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator( @@ -515,12 +531,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size( + rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size( rocksdb_options_t* opt, uint64_t n); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( - rocksdb_options_t*, int, int, int); + rocksdb_options_t*, int, int, int, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( rocksdb_options_t*, rocksdb_slicetransform_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( @@ -572,6 +590,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( + rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit( rocksdb_options_t*, double); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_rate_limit( @@ -658,6 +678,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_bits( rocksdb_options_t*, uint32_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_probes(rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size( 
+ rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges( rocksdb_options_t*, size_t); @@ -669,6 +691,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support( rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats( + rocksdb_options_t*, int); enum { rocksdb_no_compression = 0, @@ -755,6 +779,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy( extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom_full(int bits_per_key); /* Merge Operator */ @@ -793,6 +819,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( + rocksdb_readoptions_t*, size_t); /* Write options */ @@ -819,10 +847,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( size_t capacity); extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( + rocksdb_cache_t* cache, size_t capacity); /* Env */ extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( rocksdb_env_t* env, int n); extern ROCKSDB_LIBRARY_API void @@ -907,6 +938,15 @@ extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string( const rocksdb_options_t* base_options, const char* opts_str, rocksdb_options_t* new_options, char** 
errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr); + // referring to convention (3), this should be used by client // to free memory that was malloc()ed extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); diff --git a/external/rocksdb/include/rocksdb/cache.h b/external/rocksdb/include/rocksdb/cache.h index 4e4b0e19c0..53cb6c60f6 100644 --- a/external/rocksdb/include/rocksdb/cache.h +++ b/external/rocksdb/include/rocksdb/cache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -22,46 +22,53 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_ #define STORAGE_ROCKSDB_INCLUDE_CACHE_H_ -#include #include +#include #include "rocksdb/slice.h" +#include "rocksdb/status.h" namespace rocksdb { -using std::shared_ptr; - class Cache; // Create a new cache with a fixed size capacity. The cache is sharded -// to 2^numShardBits shards, by hash of the key. The total capacity +// to 2^num_shard_bits shards, by hash of the key. The total capacity // is divided and evenly assigned to each shard. 
-// -// The functions without parameter numShardBits uses default value, which is 4 -extern shared_ptr NewLRUCache(size_t capacity); -extern shared_ptr NewLRUCache(size_t capacity, int numShardBits); +extern std::shared_ptr NewLRUCache(size_t capacity, + int num_shard_bits = 6, + bool strict_capacity_limit = false); class Cache { public: - Cache() { } + Cache() {} // Destroys all existing entries by calling the "deleter" - // function that was passed to the constructor. - virtual ~Cache(); + // function that was passed via the Insert() function. + // + // @See Insert + virtual ~Cache() {} // Opaque handle to an entry stored in the cache. - struct Handle { }; + struct Handle {}; // Insert a mapping from key->value into the cache and assign it // the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::Incomplete. // - // Returns a handle that corresponds to the mapping. The caller - // must call this->Release(handle) when the returned mapping is no - // longer needed. + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. // // When the inserted entry is no longer needed, the key and // value will be passed to "deleter". - virtual Handle* Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)) = 0; + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr) = 0; // If the cache has no mapping for "key", returns nullptr. 
// @@ -85,9 +92,8 @@ class Cache { // underlying entry will be kept around until all existing handles // to it have been released. virtual void Erase(const Slice& key) = 0; - // Return a new numeric id. May be used by multiple clients who are - // sharing the same cache to partition the key space. Typically the + // sharding the same cache to partition the key space. Typically the // client will allocate a new id at startup and prepend the id to // its cache keys. virtual uint64_t NewId() = 0; @@ -98,6 +104,14 @@ class Cache { // purge the released entries from the cache in order to lower the usage virtual void SetCapacity(size_t capacity) = 0; + // Set whether to return error on insertion when cache reaches its full + // capacity. + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + + // Get the flag whether to return error on insertion when cache reaches its + // full capacity. + virtual bool HasStrictCapacityLimit() const = 0; + // returns the maximum configured capacity of the cache virtual size_t GetCapacity() const = 0; @@ -115,8 +129,8 @@ class Cache { // memory - call this only if you're shutting down the process. // Any attempts of using cache after this call will fail terribly. // Always delete the DB object before calling this method! - virtual void DisownData() { - // default implementation is noop + virtual void DisownData(){ + // default implementation is noop }; // Apply callback to all entries in the cache @@ -125,14 +139,14 @@ class Cache { virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) = 0; - private: - void LRU_Remove(Handle* e); - void LRU_Append(Handle* e); - void Unref(Handle* e); + // Remove all entries. + // Prerequisit: no entry is referenced. 
+ virtual void EraseUnRefEntries() = 0; + private: // No copying allowed Cache(const Cache&); - void operator=(const Cache&); + Cache& operator=(const Cache&); }; } // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/compaction_filter.h b/external/rocksdb/include/rocksdb/compaction_filter.h index 698753c248..acdc3aa1bb 100644 --- a/external/rocksdb/include/rocksdb/compaction_filter.h +++ b/external/rocksdb/include/rocksdb/compaction_filter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -39,6 +39,8 @@ class CompactionFilter { // Is this compaction requested by the client (true), // or is it occurring as an automatic compaction process bool is_manual_compaction; + // Which column family this compaction is for. + uint32_t column_family_id; }; virtual ~CompactionFilter() {} @@ -91,11 +93,27 @@ class CompactionFilter { // The compaction process invokes this method on every merge operand. If this // method returns true, the merge operand will be ignored and not written out // in the compaction output + // + // Note: If you are using a TransactionDB, it is not recommended to implement + // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB + // may not realize there is a write conflict and may allow a Transaction to + // Commit that should have failed. Instead, it is better to implement any + // Merge filtering inside the MergeOperator. virtual bool FilterMergeOperand(int level, const Slice& key, const Slice& operand) const { return false; } + // By default, compaction will only call Filter() on keys written after the + // most recent call to GetSnapshot(). 
However, if the compaction filter + // overrides IgnoreSnapshots to make it return false, the compaction filter + // will be called even if the keys were written before the last snapshot. + // This behavior is to be used only when we want to delete a set of keys + // irrespective of snapshots. In particular, care should be taken + // to understand that the values of thesekeys will change even if we are + // using a snapshot. + virtual bool IgnoreSnapshots() const { return false; } + // Returns a name that identifies this compaction filter. // The name will be printed to LOG file on start up for diagnosis. virtual const char* Name() const = 0; @@ -114,20 +132,6 @@ class CompactionFilterFactory { virtual const char* Name() const = 0; }; -// Default implementation of CompactionFilterFactory which does not -// return any filter -class DefaultCompactionFilterFactory : public CompactionFilterFactory { - public: - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { - return std::unique_ptr(nullptr); - } - - virtual const char* Name() const override { - return "DefaultCompactionFilterFactory"; - } -}; - } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ diff --git a/external/rocksdb/include/rocksdb/compaction_job_stats.h b/external/rocksdb/include/rocksdb/compaction_job_stats.h index 5331900157..cfd81f80e9 100644 --- a/external/rocksdb/include/rocksdb/compaction_job_stats.h +++ b/external/rocksdb/include/rocksdb/compaction_job_stats.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -61,7 +61,7 @@ struct CompactionJobStats { uint64_t num_corrupt_keys; // Following counters are only populated if - // options.compaction_measure_io_stats = true; + // options.report_bg_io_stats = true; // Time spent on file's Append() call. uint64_t file_write_nanos; diff --git a/external/rocksdb/include/rocksdb/comparator.h b/external/rocksdb/include/rocksdb/comparator.h index 8fc2710aad..1c67b0d4eb 100644 --- a/external/rocksdb/include/rocksdb/comparator.h +++ b/external/rocksdb/include/rocksdb/comparator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/convenience.h b/external/rocksdb/include/rocksdb/convenience.h index db597279ee..07b267eb5d 100644 --- a/external/rocksdb/include/rocksdb/convenience.h +++ b/external/rocksdb/include/rocksdb/convenience.h @@ -1,12 +1,15 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #pragma once -#include #include +#include +#include + +#include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/table.h" @@ -17,7 +20,7 @@ namespace rocksdb { // base_options, and return the new options as a result. 
// // If input_strings_escaped is set to true, then each escaped characters -// prefixed by '\' in the the values of the opts_map will be further +// prefixed by '\' in the values of the opts_map will be further // converted back to the raw string before assigning to the associated // options. Status GetColumnFamilyOptionsFromMap( @@ -29,7 +32,7 @@ Status GetColumnFamilyOptionsFromMap( // base_options, and return the new options as a result. // // If input_strings_escaped is set to true, then each escaped characters -// prefixed by '\' in the the values of the opts_map will be further +// prefixed by '\' in the values of the opts_map will be further // converted back to the raw string before assigning to the associated // options. Status GetDBOptionsFromMap( @@ -40,7 +43,14 @@ Status GetDBOptionsFromMap( Status GetBlockBasedTableOptionsFromMap( const BlockBasedTableOptions& table_options, const std::unordered_map& opts_map, - BlockBasedTableOptions* new_table_options); + BlockBasedTableOptions* new_table_options, + bool input_strings_escaped = false); + +Status GetPlainTableOptionsFromMap( + const PlainTableOptions& table_options, + const std::unordered_map& opts_map, + PlainTableOptions* new_table_options, + bool input_strings_escaped = false); // Take a string representation of option names and values, apply them into the // base_options, and return the new options as a result. 
The string has the @@ -68,16 +78,35 @@ Status GetStringFromColumnFamilyOptions(std::string* opts_str, const ColumnFamilyOptions& db_options, const std::string& delimiter = "; "); +Status GetStringFromCompressionType(std::string* compression_str, + CompressionType compression_type); + Status GetBlockBasedTableOptionsFromString( const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options); +Status GetPlainTableOptionsFromString( + const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options); + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, + std::unique_ptr* new_mem_factory); + Status GetOptionsFromString(const Options& base_options, const std::string& opts_str, Options* new_options); -/// Request stopping background work, if wait is true wait until it's done +// Request stopping background work, if wait is true wait until it's done void CancelAllBackgroundWork(DB* db, bool wait = false); + +// Delete files which are entirely in the given range +// Could leave some keys in the range which are in files which are not +// entirely in the range. +// Snapshots before the delete might not see the data in the given range. +Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/db.h b/external/rocksdb/include/rocksdb/db.h index 5a49638bd1..3a1d6b33cc 100644 --- a/external/rocksdb/include/rocksdb/db.h +++ b/external/rocksdb/include/rocksdb/db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,18 +12,20 @@ #include #include #include -#include #include #include -#include "rocksdb/metadata.h" -#include "rocksdb/version.h" +#include +#include "rocksdb/immutable_options.h" #include "rocksdb/iterator.h" -#include "rocksdb/options.h" -#include "rocksdb/types.h" -#include "rocksdb/transaction_log.h" #include "rocksdb/listener.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" #include "rocksdb/snapshot.h" +#include "rocksdb/sst_file_writer.h" #include "rocksdb/thread_status.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" +#include "rocksdb/version.h" #ifdef _WIN32 // Windows API macro interference @@ -49,14 +51,7 @@ class EventListener; using std::unique_ptr; -class ColumnFamilyHandle { - public: - virtual ~ColumnFamilyHandle() {} - virtual const std::string& GetName() const = 0; - virtual uint32_t GetID() const = 0; -}; extern const std::string kDefaultColumnFamilyName; - struct ColumnFamilyDescriptor { std::string name; ColumnFamilyOptions options; @@ -67,6 +62,23 @@ struct ColumnFamilyDescriptor { : name(_name), options(_options) {} }; +class ColumnFamilyHandle { + public: + virtual ~ColumnFamilyHandle() {} + // Returns the name of the column family associated with the current handle. + virtual const std::string& GetName() const = 0; + // Returns the ID of the column family associated with the current handle. + virtual uint32_t GetID() const = 0; + // Fills "*desc" with the up-to-date descriptor of the column family + // associated with this handle. Since it fills "*desc" with the up-to-date + // information, this call might internally lock and release DB mutex to + // access the up-to-date CF options. In addition, all the pointer-typed + // options cannot be referenced any longer than the original options exist. + // + // Note that this function is not supported in RocksDBLite. 
+ virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0; +}; + static const int kMajorVersion = __ROCKSDB_MAJOR__; static const int kMinorVersion = __ROCKSDB_MINOR__; @@ -135,7 +147,9 @@ class DB { // in rocksdb::kDefaultColumnFamilyName. // If everything is OK, handles will on return be the same size // as column_families --- handles[i] will be a handle that you - // will use to operate on column family column_family[i] + // will use to operate on column family column_family[i]. + // Before delete DB, you have to close All column families by calling + // DestroyColumnFamilyHandle() with all the handles. static Status Open(const DBOptions& db_options, const std::string& name, const std::vector& column_families, std::vector* handles, DB** dbptr); @@ -161,6 +175,11 @@ class DB { // only records a drop record in the manifest and prevents the column // family from flushing and compacting. virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); + // Close a column family specified by column_family handle and destroy + // the column family handle specified to avoid double deletion. This call + // deletes the column family handle by default. Use this method to + // close column family instead of deleting column family handle directly + virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family); // Set the database entry for "key" to "value". // If "key" already exists, it will be overwritten. @@ -188,6 +207,18 @@ class DB { // Remove the database entry for "key". Requires that the key exists // and was not overwritten. Returns OK on success, and a non-OK status // on error. It is not an error if "key" did not exist in the database. + // + // If a key is overwritten (by calling Put() multiple times), then the result + // of calling SingleDelete() on this key is undefined. SingleDelete() only + // behaves correctly if there has been only one Put() for this key since the + // previous call to SingleDelete() for this key. 
+ // + // This feature is currently an experimental performance optimization + // for a very specific workload. It is up to the caller to ensure that + // SingleDelete is only used for a key that is not deleted using Delete() or + // written using Merge(). Mixing SingleDelete operations with Deletes and + // Merges can result in undefined behavior. + // // Note: consider setting options.sync = true. virtual Status SingleDelete(const WriteOptions& options, ColumnFamilyHandle* column_family, @@ -258,9 +289,10 @@ class DB { // This check is potentially lighter-weight than invoking DB::Get(). One way // to make this lighter weight is to avoid doing any IOs. // Default implementation here returns true and sets 'value_found' to false - virtual bool KeyMayExist(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, bool* value_found = nullptr) { + virtual bool KeyMayExist(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, std::string* /*value*/, + bool* value_found = nullptr) { if (value_found != nullptr) { *value_found = false; } @@ -303,87 +335,170 @@ class DB { // use "snapshot" after this call. virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; - // DB implementations can export properties about their state - // via this method. If "property" is a valid property understood by this - // DB implementation, fills "*value" with its current value and returns - // true. Otherwise returns false. - // - // - // Valid property names include: - // - // "rocksdb.num-files-at-level" - return the number of files at level , - // where is an ASCII representation of a level number (e.g. "0"). - // "rocksdb.stats" - returns a multi-line string that describes statistics - // about the internal operation of the DB. - // "rocksdb.sstables" - returns a multi-line string that describes all - // of the sstables that make up the db contents. 
- // "rocksdb.cfstats" - // "rocksdb.dbstats" - // "rocksdb.num-immutable-mem-table" - // "rocksdb.mem-table-flush-pending" - // "rocksdb.compaction-pending" - 1 if at least one compaction is pending - // "rocksdb.background-errors" - accumulated number of background errors - // "rocksdb.cur-size-active-mem-table" -// "rocksdb.size-all-mem-tables" -// "rocksdb.num-entries-active-mem-table" -// "rocksdb.num-entries-imm-mem-tables" -// "rocksdb.num-deletes-active-mem-table" -// "rocksdb.num-deletes-imm-mem-tables" -// "rocksdb.estimate-num-keys" - estimated keys in the column family -// "rocksdb.estimate-table-readers-mem" - estimated memory used for reding -// SST tables, that is not counted as a part of block cache. -// "rocksdb.is-file-deletions-enabled" -// "rocksdb.num-snapshots" -// "rocksdb.oldest-snapshot-time" -// "rocksdb.num-live-versions" - `version` is an internal data structure. -// See version_set.h for details. More live versions often mean more SST -// files are held from being deleted, by iterators or unfinished -// compactions. -// "rocksdb.estimate-live-data-size" -// "rocksdb.total-sst-files-size" - total size of all used sst files, this may -// slow down online queries if there are too many files. -// "rocksdb.base-level" -// "rocksdb.estimate-pending-compaction-bytes" - estimated total number of -// bytes compaction needs to rewrite the data to get all levels down -// to under target size. Not valid for other compactions than level-based. -// "rocksdb.aggregated-table-properties" - returns a string representation of -// the aggregated table properties of the target column family. -// "rocksdb.aggregated-table-properties-at-level", same as the previous -// one but only returns the aggregated table properties of the specified -// level "N" at the target column family. -// replaced by the target level. #ifndef ROCKSDB_LITE + // Contains all valid property arguments for GetProperty(). 
+ // + // NOTE: Property names cannot end in numbers since those are interpreted as + // arguments, e.g., see kNumFilesAtLevelPrefix. struct Properties { + // "rocksdb.num-files-at-level" - returns string containing the number + // of files at level , where is an ASCII representation of a + // level number (e.g., "0"). static const std::string kNumFilesAtLevelPrefix; + + // "rocksdb.compression-ratio-at-level" - returns string containing the + // compression ratio of data at level , where is an ASCII + // representation of a level number (e.g., "0"). Here, compression + // ratio is defined as uncompressed data size / compressed file size. + // Returns "-1.0" if no open files at level . + static const std::string kCompressionRatioAtLevelPrefix; + + // "rocksdb.stats" - returns a multi-line string containing the data + // described by kCFStats followed by the data described by kDBStats. static const std::string kStats; + + // "rocksdb.sstables" - returns a multi-line string summarizing current + // SST files. static const std::string kSSTables; + + // "rocksdb.cfstats" - returns a multi-line string with general column + // family stats per-level over db's lifetime ("L"), aggregated over + // db's lifetime ("Sum"), and aggregated over the interval since the + // last retrieval ("Int"). static const std::string kCFStats; + + // "rocksdb.dbstats" - returns a multi-line string with general database + // stats, both cumulative (over the db's lifetime) and interval (since + // the last retrieval of kDBStats). static const std::string kDBStats; + + // "rocksdb.levelstats" - returns multi-line string containing the number + // of files per level and total size of each level (MB). + static const std::string kLevelStats; + + // "rocksdb.num-immutable-mem-table" - returns number of immutable + // memtables that have not yet been flushed. 
static const std::string kNumImmutableMemTable; + + // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable + // memtables that have already been flushed. + static const std::string kNumImmutableMemTableFlushed; + + // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is + // pending; otherwise, returns 0. static const std::string kMemTableFlushPending; + + // "rocksdb.num-running-flushes" - returns the number of currently running + // flushes. + static const std::string kNumRunningFlushes; + + // "rocksdb.compaction-pending" - returns 1 if at least one compaction is + // pending; otherwise, returns 0. static const std::string kCompactionPending; + + // "rocksdb.num-running-compactions" - returns the number of currently + // running compactions. + static const std::string kNumRunningCompactions; + + // "rocksdb.background-errors" - returns accumulated number of background + // errors. static const std::string kBackgroundErrors; + + // "rocksdb.cur-size-active-mem-table" - returns approximate size of active + // memtable (bytes). static const std::string kCurSizeActiveMemTable; + + // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active + // and unflushed immutable memtables (bytes). static const std::string kCurSizeAllMemTables; + + // "rocksdb.size-all-mem-tables" - returns approximate size of active, + // unflushed immutable, and pinned immutable memtables (bytes). static const std::string kSizeAllMemTables; + + // "rocksdb.num-entries-active-mem-table" - returns total number of entries + // in the active memtable. static const std::string kNumEntriesActiveMemTable; + + // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries + // in the unflushed immutable memtables. static const std::string kNumEntriesImmMemTables; + + // "rocksdb.num-deletes-active-mem-table" - returns total number of delete + // entries in the active memtable. 
static const std::string kNumDeletesActiveMemTable; + + // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete + // entries in the unflushed immutable memtables. static const std::string kNumDeletesImmMemTables; + + // "rocksdb.estimate-num-keys" - returns estimated number of total keys in + // the active and unflushed immutable memtables. static const std::string kEstimateNumKeys; + + // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for + // reading SST tables, excluding memory used in block cache (e.g., + // filter and index blocks). static const std::string kEstimateTableReadersMem; + + // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete + // files is enabled; otherwise, returns a non-zero number. static const std::string kIsFileDeletionsEnabled; + + // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the + // database. static const std::string kNumSnapshots; + + // "rocksdb.oldest-snapshot-time" - returns number representing unix + // timestamp of oldest unreleased snapshot. static const std::string kOldestSnapshotTime; + + // "rocksdb.num-live-versions" - returns number of live versions. `Version` + // is an internal data structure. See version_set.h for details. More + // live versions often mean more SST files are held from being deleted, + // by iterators or unfinished compactions. static const std::string kNumLiveVersions; + + // "rocksdb.current-super-version-number" - returns number of current LSM + // version. It is a uint64_t integer number, incremented after there is + // any change to the LSM tree. The number is not preserved after restarting + // the DB. After DB restart, it will start from 0 again. + static const std::string kCurrentSuperVersionNumber; + + // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of + // live data in bytes. 
static const std::string kEstimateLiveDataSize; + + // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST + // files. + // WARNING: may slow down online queries if there are too many files. static const std::string kTotalSstFilesSize; + + // "rocksdb.base-level" - returns number of level to which L0 data will be + // compacted. + static const std::string kBaseLevel; + + // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total + // number of bytes compaction needs to rewrite to get all levels down + // to under target size. Not valid for other compactions than level- + // based. static const std::string kEstimatePendingCompactionBytes; + + // "rocksdb.aggregated-table-properties" - returns a string representation + // of the aggregated table properties of the target column family. static const std::string kAggregatedTableProperties; + + // "rocksdb.aggregated-table-properties-at-level", same as the previous + // one but only returns the aggregated table properties of the + // specified level "N" at the target column family. static const std::string kAggregatedTablePropertiesAtLevel; }; #endif /* ROCKSDB_LITE */ + // DB implementations can export properties about their state via this method. + // If "property" is a valid property understood by this DB implementation (see + // Properties struct above for valid options), fills "*value" with its current + // value and returns true. Otherwise, returns false. 
virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { @@ -410,16 +525,24 @@ class DB { // "rocksdb.num-snapshots" // "rocksdb.oldest-snapshot-time" // "rocksdb.num-live-versions" + // "rocksdb.current-super-version-number" // "rocksdb.estimate-live-data-size" // "rocksdb.total-sst-files-size" // "rocksdb.base-level" // "rocksdb.estimate-pending-compaction-bytes" + // "rocksdb.num-running-compactions" + // "rocksdb.num-running-flushes" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { return GetIntProperty(DefaultColumnFamily(), property, value); } + // Same as GetIntProperty(), but this one returns the aggregated int + // property from all column families. + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) = 0; + // For each i in [0,n-1], store in "sizes[i]", the approximate // file system space used by keys in "[range[i].start .. range[i].limit)". 
// @@ -465,14 +588,14 @@ class DB { } #if defined(__GNUC__) || defined(__clang__) - __attribute__((deprecated)) + __attribute__((__deprecated__)) #elif _WIN32 __declspec(deprecated) #endif - virtual Status - CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, - const Slice* end, bool change_level = false, - int target_level = -1, uint32_t target_path_id = 0) { + virtual Status + CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, + const Slice* end, bool change_level = false, + int target_level = -1, uint32_t target_path_id = 0) { CompactRangeOptions options; options.change_level = change_level; options.target_level = target_level; @@ -480,14 +603,13 @@ class DB { return CompactRange(options, column_family, begin, end); } #if defined(__GNUC__) || defined(__clang__) - __attribute__((deprecated)) + __attribute__((__deprecated__)) #elif _WIN32 __declspec(deprecated) #endif - virtual Status - CompactRange(const Slice* begin, const Slice* end, - bool change_level = false, int target_level = -1, - uint32_t target_path_id = 0) { + virtual Status + CompactRange(const Slice* begin, const Slice* end, bool change_level = false, + int target_level = -1, uint32_t target_path_id = 0) { CompactRangeOptions options; options.change_level = change_level; options.target_level = target_level; @@ -495,8 +617,9 @@ class DB { return CompactRange(options, DefaultColumnFamily(), begin, end); } - virtual Status SetOptions(ColumnFamilyHandle* column_family, - const std::unordered_map& new_options) { + virtual Status SetOptions( + ColumnFamilyHandle* /*column_family*/, + const std::unordered_map& /*new_options*/) { return Status::NotSupported("Not implemented"); } virtual Status SetOptions( @@ -531,6 +654,18 @@ class DB { virtual Status PauseBackgroundWork() = 0; virtual Status ContinueBackgroundWork() = 0; + // This function will enable automatic compactions for the given column + // families if they were previously disabled. 
The function will first set the + // disable_auto_compactions option for each column family to 'false', after + // which it will schedule a flush/compaction. + // + // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API + // does NOT schedule a flush/compaction afterwards, and only changes the + // parameter itself within the column family option. + // + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) = 0; + // Number of levels used for this DB. virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } @@ -646,7 +781,8 @@ class DB { // Returns a list of all table files with their level, start key // and end key - virtual void GetLiveFilesMetaData(std::vector* metadata) {} + virtual void GetLiveFilesMetaData( + std::vector* /*metadata*/) {} // Obtains the meta data of the specified column family of the DB. // Status::NotFound() will be returned if the current DB does not have @@ -654,9 +790,8 @@ class DB { // // If cf_name is not specified, then the metadata of the default // column family will be returned. - virtual void GetColumnFamilyMetaData( - ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* metadata) {} + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, + ColumnFamilyMetaData* /*metadata*/) {} // Get the metadata of the default column family. void GetColumnFamilyMetaData( @@ -664,33 +799,77 @@ class DB { GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); } - // Load table file located at "file_path" into "column_family", a pointer to - // ExternalSstFileInfo can be used instead of "file_path" to do a blind add - // that wont need to read the file, move_file can be set to true to - // move the file instead of copying it. 
+ // Batch load table files whose paths are stored in "file_path_list" into + // "column_family", a vector of ExternalSstFileInfo can be used + // instead of "file_path_list" to do a blind batch add that won't + // need to read the file, move_file can be set to true to + // move the files instead of copying them. // // Current Requirements: - // (1) Memtable is empty. - // (2) All existing files (if any) have sequence number = 0. - // (3) Key range in loaded table file don't overlap with existing - // files key ranges. - // (4) No other writes happen during AddFile call, otherwise - // DB may get corrupted. - // (5) Database have at least 2 levels. + // (1) The key ranges of the files don't overlap with each other. + // (2) The key range of any file in list doesn't overlap with + // existing keys or tombstones in DB. + // (3) No snapshots are held. + // + // Notes: We will try to ingest the files to the lowest possible level + // even if the file compression doesn't match the level compression virtual Status AddFile(ColumnFamilyHandle* column_family, - const std::string& file_path, + const std::vector& file_path_list, bool move_file = false) = 0; - virtual Status AddFile(const std::string& file_path, bool move_file = false) { - return AddFile(DefaultColumnFamily(), file_path, move_file); + virtual Status AddFile(const std::vector& file_path_list, + bool move_file = false) { + return AddFile(DefaultColumnFamily(), file_path_list, move_file); + } +#if defined(__GNUC__) || defined(__clang__) + __attribute__((__deprecated__)) +#elif _WIN32 + __declspec(deprecated) +#endif + virtual Status + AddFile(ColumnFamilyHandle* column_family, const std::string& file_path, + bool move_file = false) { + return AddFile(column_family, std::vector(1, file_path), + move_file); + } +#if defined(__GNUC__) || defined(__clang__) + __attribute__((__deprecated__)) +#elif _WIN32 + __declspec(deprecated) +#endif + virtual Status + AddFile(const std::string& file_path, bool move_file = false) { + 
return AddFile(DefaultColumnFamily(), + std::vector(1, file_path), move_file); } // Load table file with information "file_info" into "column_family" virtual Status AddFile(ColumnFamilyHandle* column_family, - const ExternalSstFileInfo* file_info, + const std::vector& file_info_list, bool move_file = false) = 0; - virtual Status AddFile(const ExternalSstFileInfo* file_info, + virtual Status AddFile(const std::vector& file_info_list, bool move_file = false) { - return AddFile(DefaultColumnFamily(), file_info, move_file); + return AddFile(DefaultColumnFamily(), file_info_list, move_file); + } +#if defined(__GNUC__) || defined(__clang__) + __attribute__((__deprecated__)) +#elif _WIN32 + __declspec(deprecated) +#endif + virtual Status + AddFile(ColumnFamilyHandle* column_family, + const ExternalSstFileInfo* file_info, bool move_file = false) { + return AddFile(column_family, + std::vector(1, *file_info), move_file); + } +#if defined(__GNUC__) || defined(__clang__) + __attribute__((__deprecated__)) +#elif _WIN32 + __declspec(deprecated) +#endif + virtual Status + AddFile(const ExternalSstFileInfo* file_info, bool move_file = false) { + return AddFile(DefaultColumnFamily(), + std::vector(1, *file_info), move_file); } #endif // ROCKSDB_LITE @@ -709,6 +888,9 @@ class DB { virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { return GetPropertiesOfAllTables(DefaultColumnFamily(), props); } + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) = 0; #endif // ROCKSDB_LITE // Needed for StackableDB @@ -729,7 +911,24 @@ Status DestroyDB(const std::string& name, const Options& options); // resurrect as much of the contents of the database as possible. // Some data may be lost, so be careful when calling this function // on a database that contains important information. 
+// +// With this API, we will warn and skip data associated with column families not +// specified in column_families. +// +// @param column_families Descriptors for known column families +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families); + +// @param unknown_cf_opts Options for column families encountered during the +// repair that were not specified in column_families. +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families, + const ColumnFamilyOptions& unknown_cf_opts); + +// @param options These options will be used for the database and for ALL column +// families encountered during the repair Status RepairDB(const std::string& dbname, const Options& options); + #endif } // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/db_bench_tool.h b/external/rocksdb/include/rocksdb/db_bench_tool.h new file mode 100644 index 0000000000..0e33ae96e2 --- /dev/null +++ b/external/rocksdb/include/rocksdb/db_bench_tool.h @@ -0,0 +1,9 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +namespace rocksdb { +int db_bench_tool(int argc, char** argv); +} // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/db_dump_tool.h b/external/rocksdb/include/rocksdb/db_dump_tool.h index 67575a94be..1acc631762 100644 --- a/external/rocksdb/include/rocksdb/db_dump_tool.h +++ b/external/rocksdb/include/rocksdb/db_dump_tool.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/delete_scheduler.h b/external/rocksdb/include/rocksdb/delete_scheduler.h deleted file mode 100644 index 788d592397..0000000000 --- a/external/rocksdb/include/rocksdb/delete_scheduler.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include - -#include "rocksdb/status.h" - -namespace rocksdb { - -class Env; -class Logger; - -// DeleteScheduler allow the DB to enforce a rate limit on file deletion, -// Instead of deleteing files immediately, files are moved to trash_dir -// and deleted in a background thread that apply sleep penlty between deletes -// if they are happening in a rate faster than rate_bytes_per_sec, -// -// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this -// case DeleteScheduler will delete files immediately. -class DeleteScheduler { - public: - virtual ~DeleteScheduler() {} - - // Return delete rate limit in bytes per second - virtual int64_t GetRateBytesPerSecond() = 0; - - // Move file to trash directory and schedule it's deletion - virtual Status DeleteFile(const std::string& fname) = 0; - - // Return a map containing errors that happened in the background thread - // file_path => error status - virtual std::map GetBackgroundErrors() = 0; - - // Wait for all files being deleteing in the background to finish or for - // destructor to be called. 
- virtual void WaitForEmptyTrash() = 0; -}; - -// Create a new DeleteScheduler that can be shared among multiple RocksDB -// instances to control the file deletion rate. -// -// @env: Pointer to Env object, please see "rocksdb/env.h". -// @trash_dir: Path to the directory where deleted files will be moved into -// to be deleted in a background thread while applying rate limiting. If this -// directory dont exist, it will be created. This directory should not be -// used by any other process or any other DeleteScheduler. -// @rate_bytes_per_sec: How many bytes should be deleted per second, If this -// value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb -// in 1 second, we will wait for another 3 seconds before we delete other -// files, Set to 0 to disable rate limiting. -// @info_log: If not nullptr, info_log will be used to log errors. -// @delete_exisitng_trash: If set to true, the newly created DeleteScheduler -// will delete files that already exist in trash_dir. -// @status: If not nullptr, status will contain any errors that happened during -// creating the missing trash_dir or deleting existing files in trash. -extern DeleteScheduler* NewDeleteScheduler( - Env* env, const std::string& trash_dir, int64_t rate_bytes_per_sec, - std::shared_ptr info_log = nullptr, - bool delete_exisitng_trash = true, Status* status = nullptr); - -} // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/env.h b/external/rocksdb/include/rocksdb/env.h index 57c60f0c9a..2d33da27cb 100644 --- a/external/rocksdb/include/rocksdb/env.h +++ b/external/rocksdb/include/rocksdb/env.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -68,6 +68,12 @@ struct EnvOptions { // If true, then use mmap to write data bool use_mmap_writes = true; + // If true, then use O_DIRECT for reading data + bool use_direct_reads = false; + + // If true, then use O_DIRECT for writing data + bool use_direct_writes = false; + // If false, fallocate() calls are bypassed bool allow_fallocate = true; @@ -88,12 +94,29 @@ struct EnvOptions { // WAL writes bool fallocate_with_keep_size = true; + // See DBOptions doc + size_t compaction_readahead_size; + + // See DBOptions doc + size_t random_access_max_buffer_size; + + // See DBOptions doc + size_t writable_file_max_buffer_size = 1024 * 1024; + // If not nullptr, write rate limiting is enabled for flush and compaction RateLimiter* rate_limiter = nullptr; }; class Env { public: + struct FileAttributes { + // File name + std::string name; + + // Size of file in bytes + uint64_t size_bytes; + }; + Env() : thread_status_updater_(nullptr) {} virtual ~Env(); @@ -139,6 +162,12 @@ class Env { unique_ptr* result, const EnvOptions& options) = 0; + // Reuse an existing file by renaming it and opening it as writable. + virtual Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + unique_ptr* result, + const EnvOptions& options); + // Create an object that represents a directory. Will fail if directory // doesn't exist. If the directory exists, it will open the directory // and create a new Directory object. @@ -162,6 +191,15 @@ class Env { virtual Status GetChildren(const std::string& dir, std::vector* result) = 0; + // Store in *result the attributes of the children of the specified directory. + // In case the implementation lists the directory prior to iterating the files + // and files are concurrently deleted, the deleted files will be omitted from + // result. + // The name attributes are relative to "dir". + // Original contents of *result are dropped. 
+ virtual Status GetChildrenFileAttributes(const std::string& dir, + std::vector* result); + // Delete the named file. virtual Status DeleteFile(const std::string& fname) = 0; @@ -229,8 +267,11 @@ class Env { // added to the same Env may run concurrently in different threads. // I.e., the caller may not assume that background work items are // serialized. + // When the UnSchedule function is called, the unschedFunction + // registered at the time of Schedule is invoked with arg as a parameter. virtual void Schedule(void (*function)(void* arg), void* arg, - Priority pri = LOW, void* tag = nullptr) = 0; + Priority pri = LOW, void* tag = nullptr, + void (*unschedFunction)(void* arg) = 0) = 0; // Arrange to remove jobs for given arg from the queue_ if they are not // already scheduled. Caller is expected to have exclusive lock on arg. @@ -402,6 +443,10 @@ class RandomAccessFile { return false; } + // For cases when read-ahead is implemented in the platform dependent + // layer + virtual void EnableReadAhead() {} + // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). // Furthermore, it tries to make this ID at most "max_size" bytes. If such an @@ -522,7 +567,7 @@ class WritableFile { * underlying storage of a file (generally via fallocate) if the Env * instance supports it. */ - void SetPreallocationBlockSize(size_t size) { + virtual void SetPreallocationBlockSize(size_t size) { preallocation_block_size_ = size; } @@ -551,14 +596,14 @@ class WritableFile { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); } + virtual Status RangeSync(uint64_t offset, uint64_t nbytes) { return Status::OK(); } // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. 
This allows for pre-allocation // of space on devices where it can result in less file // fragmentation and/or less waste from over-zealous filesystem // pre-allocation. - void PrepareWrite(size_t offset, size_t len) { + virtual void PrepareWrite(size_t offset, size_t len) { if (preallocation_block_size_ == 0) { return; } @@ -571,8 +616,8 @@ class WritableFile { if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; - Allocate(static_cast(block_size * last_preallocated_block_), - static_cast(block_size * num_spanned_blocks)); + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); last_preallocated_block_ = new_last_preallocated_block; } } @@ -581,7 +626,7 @@ class WritableFile { /* * Pre-allocate space for a file. */ - virtual Status Allocate(off_t offset, off_t len) { + virtual Status Allocate(uint64_t offset, uint64_t len) { return Status::OK(); } @@ -596,6 +641,7 @@ class WritableFile { protected: friend class WritableFileWrapper; + friend class WritableFileMirror; Env::IOPriority io_priority_; }; @@ -687,7 +733,7 @@ extern void Error(const shared_ptr& info_log, const char* format, ...); extern void Fatal(const shared_ptr& info_log, const char* format, ...); // Log the specified data to *info_log if info_log is non-nullptr. -// The default info log level is InfoLogLevel::ERROR. +// The default info log level is InfoLogLevel::INFO_LEVEL. extern void Log(const shared_ptr& info_log, const char* format, ...) # if defined(__GNUC__) || defined(__clang__) __attribute__((__format__ (__printf__, 2, 3))) @@ -699,7 +745,7 @@ extern void LogFlush(Logger *info_log); extern void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, ...); -// The default info log level is InfoLogLevel::ERROR. +// The default info log level is InfoLogLevel::INFO_LEVEL. extern void Log(Logger* info_log, const char* format, ...) 
# if defined(__GNUC__) || defined(__clang__) __attribute__((__format__ (__printf__, 2, 3))) @@ -749,6 +795,12 @@ class EnvWrapper : public Env { const EnvOptions& options) override { return target_->NewWritableFile(f, r, options); } + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + unique_ptr* r, + const EnvOptions& options) override { + return target_->ReuseWritableFile(fname, old_fname, r, options); + } virtual Status NewDirectory(const std::string& name, unique_ptr* result) override { return target_->NewDirectory(name, result); @@ -760,6 +812,10 @@ class EnvWrapper : public Env { std::vector* r) override { return target_->GetChildren(dir, r); } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override { + return target_->GetChildrenFileAttributes(dir, result); + } Status DeleteFile(const std::string& f) override { return target_->DeleteFile(f); } @@ -796,8 +852,8 @@ class EnvWrapper : public Env { Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } void Schedule(void (*f)(void* arg), void* a, Priority pri, - void* tag = nullptr) override { - return target_->Schedule(f, a, pri, tag); + void* tag = nullptr, void (*u)(void* arg) = 0) override { + return target_->Schedule(f, a, pri, tag, u); } int UnSchedule(void* tag, Priority pri) override { @@ -900,11 +956,18 @@ class WritableFileWrapper : public WritableFile { return target_->InvalidateCache(offset, length); } + virtual void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + virtual void PrepareWrite(size_t offset, size_t len) override { + target_->PrepareWrite(offset, len); + } + protected: - Status Allocate(off_t offset, off_t len) override { + Status Allocate(uint64_t offset, uint64_t len) override { return target_->Allocate(offset, len); } - Status RangeSync(off_t offset, off_t nbytes) override { + Status RangeSync(uint64_t offset, uint64_t nbytes) override { return 
target_->RangeSync(offset, nbytes); } @@ -918,6 +981,10 @@ class WritableFileWrapper : public WritableFile { // *base_env must remain live while the result is in use. Env* NewMemEnv(Env* base_env); +// Returns a new environment that is used for HDFS environment. +// This is a factory method for HdfsEnv declared in hdfs/env_hdfs.h +Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_ diff --git a/external/rocksdb/include/rocksdb/experimental.h b/external/rocksdb/include/rocksdb/experimental.h index 1d02e0238a..70ad0b914b 100644 --- a/external/rocksdb/include/rocksdb/experimental.h +++ b/external/rocksdb/include/rocksdb/experimental.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/filter_policy.h b/external/rocksdb/include/rocksdb/filter_policy.h index 90aefb388b..2c1588a232 100644 --- a/external/rocksdb/include/rocksdb/filter_policy.h +++ b/external/rocksdb/include/rocksdb/filter_policy.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/include/rocksdb/flush_block_policy.h b/external/rocksdb/include/rocksdb/flush_block_policy.h index 939725cf40..022e0be4a9 100644 --- a/external/rocksdb/include/rocksdb/flush_block_policy.h +++ b/external/rocksdb/include/rocksdb/flush_block_policy.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/immutable_options.h b/external/rocksdb/include/rocksdb/immutable_options.h index 589f14e999..9417aa2e97 100644 --- a/external/rocksdb/include/rocksdb/immutable_options.h +++ b/external/rocksdb/include/rocksdb/immutable_options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -50,6 +50,8 @@ struct ImmutableCFOptions { Env* env; + uint64_t delayed_write_rate; + // Allow the OS to mmap file for reading sst tables. 
Default: false bool allow_mmap_reads; @@ -73,16 +75,14 @@ struct ImmutableCFOptions { bool purge_redundant_kvs_while_flush; - uint32_t min_partial_merge_operands; - bool disable_data_sync; bool use_fsync; - CompressionType compression; - std::vector compression_per_level; + CompressionType bottommost_compression; + CompressionOptions compression_opts; bool level_compaction_dynamic_level_bytes; diff --git a/external/rocksdb/include/rocksdb/iostats_context.h b/external/rocksdb/include/rocksdb/iostats_context.h index e81092b52c..632fe44c82 100644 --- a/external/rocksdb/include/rocksdb/iostats_context.h +++ b/external/rocksdb/include/rocksdb/iostats_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,7 +18,7 @@ struct IOStatsContext { // reset all io-stats counter to zero void Reset(); - std::string ToString() const; + std::string ToString(bool exclude_zero_counters = false) const; // the thread pool id uint64_t thread_pool_id; diff --git a/external/rocksdb/include/rocksdb/iterator.h b/external/rocksdb/include/rocksdb/iterator.h index 7538e9cfb5..c39ce46413 100644 --- a/external/rocksdb/include/rocksdb/iterator.h +++ b/external/rocksdb/include/rocksdb/iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -19,15 +19,38 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ #define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ +#include #include "rocksdb/slice.h" #include "rocksdb/status.h" namespace rocksdb { -class Iterator { +class Cleanable { public: - Iterator(); - virtual ~Iterator(); + Cleanable(); + ~Cleanable(); + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + protected: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; +}; + +class Iterator : public Cleanable { + public: + Iterator() {} + virtual ~Iterator() {} // An iterator is either positioned at a key/value pair, or // not valid. This method returns true iff the iterator is valid. @@ -73,23 +96,19 @@ class Iterator { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // Clients are allowed to register function/arg1/arg2 triples that - // will be invoked when this iterator is destroyed. - // - // Note that unlike all of the preceding methods, this method is - // not abstract and therefore clients should not override it. - typedef void (*CleanupFunction)(void* arg1, void* arg2); - void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + // Property "rocksdb.iterator.is-key-pinned": + // If returning "1", this means that the Slice returned by key() is valid + // as long as the iterator is not deleted. + // It is guaranteed to always return "1" if + // - Iterator created with ReadOptions::pin_data = true + // - DB tables were created with + // BlockBasedTableOptions::use_delta_encoding = false. 
+ // Property "rocksdb.iterator.super-version-number": + // LSM version used by the iterator. The same format as DB Property + // kCurrentSuperVersionNumber. See its comment for more information. + virtual Status GetProperty(std::string prop_name, std::string* prop); private: - struct Cleanup { - CleanupFunction function; - void* arg1; - void* arg2; - Cleanup* next; - }; - Cleanup cleanup_; - // No copying allowed Iterator(const Iterator&); void operator=(const Iterator&); diff --git a/external/rocksdb/include/rocksdb/ldb_tool.h b/external/rocksdb/include/rocksdb/ldb_tool.h index 1b1c64b067..8a6918ba47 100644 --- a/external/rocksdb/include/rocksdb/ldb_tool.h +++ b/external/rocksdb/include/rocksdb/ldb_tool.h @@ -1,10 +1,12 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
#ifndef ROCKSDB_LITE #pragma once #include +#include +#include "rocksdb/db.h" #include "rocksdb/options.h" namespace rocksdb { @@ -28,8 +30,10 @@ struct LDBOptions { class LDBTool { public: - void Run(int argc, char** argv, Options db_options= Options(), - const LDBOptions& ldb_options = LDBOptions()); + void Run( + int argc, char** argv, Options db_options = Options(), + const LDBOptions& ldb_options = LDBOptions(), + const std::vector* column_families = nullptr); }; } // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/listener.h b/external/rocksdb/include/rocksdb/listener.h index f693d5c9b3..fde0db592d 100644 --- a/external/rocksdb/include/rocksdb/listener.h +++ b/external/rocksdb/include/rocksdb/listener.h @@ -4,7 +4,9 @@ #pragma once +#include #include +#include #include #include "rocksdb/compaction_job_stats.h" #include "rocksdb/status.h" @@ -12,29 +14,65 @@ namespace rocksdb { +typedef std::unordered_map> + TablePropertiesCollection; + class DB; class Status; struct CompactionJobStats; +enum CompressionType : unsigned char; -struct TableFileCreationInfo { - TableFileCreationInfo() = default; - explicit TableFileCreationInfo(TableProperties&& prop) : - table_properties(prop) {} +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, +}; + +struct TableFileCreationBriefInfo { // the name of the database where the file was created std::string db_name; // the name of the column family where the file was created. std::string cf_name; // the path to the created file. std::string file_path; - // the size of the file. - uint64_t file_size; // the id of the job (which could be flush or compaction) that // created the file. int job_id; + // reason of creating the table. + TableFileCreationReason reason; +}; + +struct TableFileCreationInfo : public TableFileCreationBriefInfo { + TableFileCreationInfo() = default; + explicit TableFileCreationInfo(TableProperties&& prop) + : table_properties(prop) {} + // the size of the file. 
+ uint64_t file_size; // Detailed properties of the created file. TableProperties table_properties; + // The status indicating whether the creation was successful or not. + Status status; }; +enum class CompactionReason { + kUnknown, + // [Level] number of L0 files > level0_file_num_compaction_trigger + kLevelL0FilesNum, + // [Level] total size of level > MaxBytesForLevel() + kLevelMaxLevelSize, + // [Universal] Compacting for size amplification + kUniversalSizeAmplification, + // [Universal] Compacting for size ratio + kUniversalSizeRatio, + // [Universal] number of sorted runs > level0_file_num_compaction_trigger + kUniversalSortedRunNum, + // [FIFO] total size > max_table_files_size + kFIFOMaxSize, + // Manual compaction + kManualCompaction, + // DB::SuggestCompactRange() marked files for compaction + kFilesMarkedForCompaction, +}; #ifndef ROCKSDB_LITE @@ -45,7 +83,7 @@ struct TableFileDeletionInfo { std::string file_path; // The id of the job which deleted the file. int job_id; - // The status indicating whether the deletion was successfull or not. + // The status indicating whether the deletion was successful or not. Status status; }; @@ -72,6 +110,8 @@ struct FlushJobInfo { SequenceNumber smallest_seqno; // The largest sequence number in the newly created file SequenceNumber largest_seqno; + // Table properties of the table being flushed + TableProperties table_properties; }; struct CompactionJobInfo { @@ -93,13 +133,42 @@ struct CompactionJobInfo { int output_level; // the names of the compaction input files. std::vector input_files; + // the names of the compaction output files. std::vector output_files; + // Table properties for input and output tables. + // The map is keyed by values from input_files and output_files. 
+ TablePropertiesCollection table_properties; + + // Reason to run the compaction + CompactionReason compaction_reason; + + // Compression algorithm used for output files + CompressionType compression; + // If non-null, this variable stores detailed information // about this compaction. CompactionJobStats stats; }; +struct MemTableInfo { + // the name of the column family to which memtable belongs + std::string cf_name; + // Sequence number of the first element that was inserted + // into the memtable. + SequenceNumber first_seqno; + // Sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into this + // memtable. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + SequenceNumber earliest_seqno; + // Total number of entries in memtable + uint64_t num_entries; + // Total number of deletes in memtable + uint64_t num_deletes; + +}; + // EventListener class contains a set of call-back functions that will // be called when specific RocksDB event happens such as flush. It can // be used as a building block for developing custom features such as @@ -138,8 +207,8 @@ class EventListener { // Note that the this function must be implemented in a way such that // it should not run for an extended period of time before the function // returns. Otherwise, RocksDB may be blocked. - virtual void OnFlushCompleted( - DB* db, const FlushJobInfo& flush_job_info) {} + virtual void OnFlushCompleted(DB* /*db*/, + const FlushJobInfo& /*flush_job_info*/) {} // A call-back function for RocksDB which will be called whenever // a SST file is deleted. Different from OnCompactionCompleted and @@ -152,8 +221,7 @@ class EventListener { // Note that if applications would like to use the passed reference // outside this function call, they should make copies from the // returned value. 
- virtual void OnTableFileDeleted( - const TableFileDeletionInfo& info) {} + virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {} // A call-back function for RocksDB which will be called whenever // a registered RocksDB compacts a file. The default implementation @@ -168,7 +236,8 @@ class EventListener { // @param ci a reference to a CompactionJobInfo struct. 'ci' is released // after this function is returned, and must be copied if it is needed // outside of this function. - virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) {} + virtual void OnCompactionCompleted(DB* /*db*/, + const CompactionJobInfo& /*ci*/) {} // A call-back function for RocksDB which will be called whenever // a SST file is created. Different from OnCompactionCompleted and @@ -178,11 +247,37 @@ class EventListener { // on file creations and deletions is suggested to implement // OnFlushCompleted and OnCompactionCompleted. // + // Historically it will only be called if the file is successfully created. + // Now it will also be called on failure case. User can check info.status + // to see if it succeeded or not. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {} + + // A call-back function for RocksDB which will be called before + // a SST file is being created. It will follow by OnTableFileCreated after + // the creation finishes. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*info*/) {} + + // A call-back function for RocksDB which will be called before + // a memtable is made immutable. 
+ // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // // Note that if applications would like to use the passed reference // outside this function call, they should make copies from these // returned value. - virtual void OnTableFileCreated( - const TableFileCreationInfo& info) {} + virtual void OnMemTableSealed( + const MemTableInfo& /*info*/) {} virtual ~EventListener() {} }; diff --git a/external/rocksdb/include/rocksdb/memtablerep.h b/external/rocksdb/include/rocksdb/memtablerep.h index f02c2d094d..f6f0309469 100644 --- a/external/rocksdb/include/rocksdb/memtablerep.h +++ b/external/rocksdb/include/rocksdb/memtablerep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -36,7 +36,9 @@ #pragma once #include +#include #include +#include namespace rocksdb { @@ -68,25 +70,36 @@ class MemTableRep { explicit MemTableRep(MemTableAllocator* allocator) : allocator_(allocator) {} - // Allocate a buf of len size for storing key. The idea is that a specific - // memtable representation knows its underlying data structure better. By - // allowing it to allocate memory, it can possibly put correlated stuff - // in consecutive memory area to make processor prefetching more efficient. + // Allocate a buf of len size for storing key. The idea is that a + // specific memtable representation knows its underlying data structure + // better. By allowing it to allocate memory, it can possibly put + // correlated stuff in consecutive memory area to make processor + // prefetching more efficient. 
virtual KeyHandle Allocate(const size_t len, char** buf); // Insert key into the collection. (The caller will pack key and value into a // single buffer and pass that in as the parameter to Insert). // REQUIRES: nothing that compares equal to key is currently in the - // collection. + // collection, and no concurrent modifications to the table in progress virtual void Insert(KeyHandle handle) = 0; + // Like Insert(handle), but may be called concurrent with other calls + // to InsertConcurrently for other handles + virtual void InsertConcurrently(KeyHandle handle) { +#ifndef ROCKSDB_LITE + throw std::runtime_error("concurrent insert not supported"); +#else + abort(); +#endif + } + // Returns true iff an entry that compares equal to key is in the collection. virtual bool Contains(const char* key) const = 0; - // Notify this table rep that it will no longer be added to. By default, does - // nothing. After MarkReadOnly() is called, this table rep will not be - // written to (ie No more calls to Allocate(), Insert(), or any writes done - // directly to entries accessed through the iterator.) + // Notify this table rep that it will no longer be added to. By default, + // does nothing. After MarkReadOnly() is called, this table rep will + // not be written to (ie No more calls to Allocate(), Insert(), + // or any writes done directly to entries accessed through the iterator.) virtual void MarkReadOnly() { } // Look up key from the mem table, since the first key in the mem table whose @@ -94,6 +107,7 @@ class MemTableRep { // callback_args directly forwarded as the first parameter, and the mem table // key as the second parameter. If the return value is false, then terminates. // Otherwise, go through the next key. + // // It's safe for Get() to terminate after having finished all the potential // key for the k.user_key(), or not. 
// @@ -109,7 +123,7 @@ class MemTableRep { } // Report an approximation of how much memory has been used other than memory - // that was allocated through the allocator. + // that was allocated through the allocator. Safe to call from any thread. virtual size_t ApproximateMemoryUsage() = 0; virtual ~MemTableRep() { } @@ -192,6 +206,10 @@ class MemTableRepFactory { const SliceTransform*, Logger* logger) = 0; virtual const char* Name() const = 0; + + // Return true if the current MemTableRep supports concurrent inserts + // Default: false + virtual bool IsInsertConcurrentlySupported() const { return false; } }; // This uses a skip list to store keys. It is the default. @@ -211,6 +229,8 @@ class SkipListFactory : public MemTableRepFactory { Logger* logger) override; virtual const char* Name() const override { return "SkipListFactory"; } + bool IsInsertConcurrentlySupported() const override { return true; } + private: const size_t lookahead_; }; diff --git a/external/rocksdb/include/rocksdb/merge_operator.h b/external/rocksdb/include/rocksdb/merge_operator.h index 05b66f2025..b3ca013b74 100644 --- a/external/rocksdb/include/rocksdb/merge_operator.h +++ b/external/rocksdb/include/rocksdb/merge_operator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -9,6 +9,7 @@ #include #include #include +#include #include "rocksdb/slice.h" @@ -32,7 +33,7 @@ class Logger; // into rocksdb); numeric addition and string concatenation are examples; // // b) MergeOperator - the generic class for all the more abstract / complex -// operations; one method (FullMerge) to merge a Put/Delete value with a +// operations; one method (FullMergeV2) to merge a Put/Delete value with a // merge operand; and another method (PartialMerge) that merges multiple // operands together. this is especially useful if your key values have // complex structures but you would still like to support client-specific @@ -69,7 +70,49 @@ class MergeOperator { const Slice* existing_value, const std::deque& operand_list, std::string* new_value, - Logger* logger) const = 0; + Logger* logger) const { + // deprecated, please use FullMergeV2() + assert(false); + return false; + } + + struct MergeOperationInput { + explicit MergeOperationInput(const Slice& _key, + const Slice* _existing_value, + const std::vector& _operand_list, + Logger* _logger) + : key(_key), + existing_value(_existing_value), + operand_list(_operand_list), + logger(_logger) {} + + // The key associated with the merge operation. + const Slice& key; + // The existing value of the current key, nullptr means that the + // value doesn't exist. + const Slice* existing_value; + // A list of operands to apply. + const std::vector& operand_list; + // Logger could be used by client to log any errors that happen during + // the merge operation. + Logger* logger; + }; + + struct MergeOperationOutput { + explicit MergeOperationOutput(std::string& _new_value, + Slice& _existing_operand) + : new_value(_new_value), existing_operand(_existing_operand) {} + + // Client is responsible for filling the merge result here. 
+ std::string& new_value; + // If the merge result is one of the existing operands (or existing_value), + // client can set this field to the operand (or existing_value) instead of + // using new_value. + Slice& existing_operand; + }; + + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const; // This function performs merge(left_op, right_op) // when both the operands are themselves merge operation types @@ -99,7 +142,7 @@ class MergeOperator { // TODO: Presently there is no way to differentiate between error/corruption // and simply "return false". For now, the client should simply return // false in any case it cannot perform partial-merge, regardless of reason. - // If there is corruption in the data, handle it in the FullMerge() function, + // If there is corruption in the data, handle it in the FullMergeV2() function // and return false there. The default implementation of PartialMerge will // always return false. virtual bool PartialMerge(const Slice& key, const Slice& left_operand, @@ -171,11 +214,8 @@ class AssociativeMergeOperator : public MergeOperator { private: // Default implementations of the MergeOperator functions - virtual bool FullMerge(const Slice& key, - const Slice* existing_value, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override; + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; virtual bool PartialMerge(const Slice& key, const Slice& left_operand, diff --git a/external/rocksdb/include/rocksdb/metadata.h b/external/rocksdb/include/rocksdb/metadata.h index 7cdf4a1a93..5425146d7b 100644 --- a/external/rocksdb/include/rocksdb/metadata.h +++ b/external/rocksdb/include/rocksdb/metadata.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/options.h b/external/rocksdb/include/rocksdb/options.h index 16aa3782bd..ac14fa570a 100644 --- a/external/rocksdb/include/rocksdb/options.h +++ b/external/rocksdb/include/rocksdb/options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -17,9 +17,10 @@ #include #include -#include "rocksdb/version.h" #include "rocksdb/listener.h" #include "rocksdb/universal_compaction.h" +#include "rocksdb/version.h" +#include "rocksdb/write_buffer_manager.h" #ifdef max #undef max @@ -33,6 +34,7 @@ class CompactionFilterFactory; class Comparator; class Env; enum InfoLogLevel : unsigned char; +class SstFileManager; class FilterPolicy; class Logger; class MergeOperator; @@ -41,17 +43,17 @@ class TableFactory; class MemTableRepFactory; class TablePropertiesCollectorFactory; class RateLimiter; -class DeleteScheduler; class Slice; class SliceTransform; class Statistics; class InternalKeyComparator; +class WalFilter; // DB contents are stored in a set of blocks, each of which holds a // sequence of key,value pairs. Each block may be compressed before // being stored in a file. The following enum describes which // compression method (if any) is used to compress a block. -enum CompressionType : char { +enum CompressionType : unsigned char { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. 
kNoCompression = 0x0, @@ -60,8 +62,12 @@ kBZip2Compression = 0x3, kLZ4Compression = 0x4, kLZ4HCCompression = 0x5, + kXpressCompression = 0x6, // zstd format is not finalized yet so it's subject to changes. kZSTDNotFinalCompression = 0x40, + + // kDisableCompressionOption is used to disable some compression options. + kDisableCompressionOption = 0xff, }; enum CompactionStyle : char { @@ -79,11 +85,23 @@ kCompactionStyleNone = 0x3, }; +// In Level-based compaction, it determines which file from a level to be +// picked to merge to the next level. We suggest people try +// kMinOverlappingRatio first when you tune your database. enum CompactionPri : char { // Slightly Priotize larger files by size compensated by #deletes - kCompactionPriByCompensatedSize = 0x0, - // First compact files whose data is oldest. - kCompactionPriByLargestSeq = 0x1, + kByCompensatedSize = 0x0, + // First compact files whose data's latest update time is oldest. + // Try this if you only update some hot keys in small ranges. + kOldestLargestSeqFirst = 0x1, + // First compact files whose range hasn't been compacted to the next level + // for the longest. If your updates are random across the key space, + // write amplification is slightly better with this option. + kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and its size is the smallest. It in many cases can optimize write + // amplification. + kMinOverlappingRatio = 0x3, }; enum class WALRecoveryMode : char { @@ -123,9 +141,23 @@ struct CompressionOptions { int window_bits; int level; int strategy; - CompressionOptions() : window_bits(-14), level(-1), strategy(0) {} - CompressionOptions(int wbits, int _lev, int _strategy) - : window_bits(wbits), level(_lev), strategy(_strategy) {} + // Maximum size of dictionary used to prime the compression library. 
Currently + // this dictionary will be constructed by sampling the first output file in a + // subcompaction when the target level is bottommost. This dictionary will be + // loaded into the compression library before compressing/uncompressing each + // data block of subsequent files in the subcompaction. Effectively, this + // improves compression ratios when there are repetitions across data blocks. + // A value of 0 indicates the feature is disabled. + // Default: 0. + uint32_t max_dict_bytes; + + CompressionOptions() + : window_bits(-14), level(-1), strategy(0), max_dict_bytes(0) {} + CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes) + : window_bits(wbits), + level(_lev), + strategy(_strategy), + max_dict_bytes(_max_dict_bytes) {} }; enum UpdateStatus { // Return status For inplace update callback @@ -145,7 +177,15 @@ struct DbPath { struct Options; struct ColumnFamilyOptions { + // The function recovers options to a previous version. Only 4.6 or later + // versions are supported. + ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + // Some functions that make it easier to optimize RocksDB + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + ColumnFamilyOptions* OptimizeForSmallDb(); // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls @@ -241,7 +281,7 @@ struct ColumnFamilyOptions { // Note that write_buffer_size is enforced per column family. // See db_write_buffer_size for sharing memory across column families. // - // Default: 4MB + // Default: 64MB // // Dynamically changeable through SetOptions() API size_t write_buffer_size; @@ -250,6 +290,9 @@ struct ColumnFamilyOptions { // The default and the minimum number is 2, so that when 1 write buffer // is being flushed to storage, new writes can continue to the other // write buffer. 
+ // If max_write_buffer_number > 3, writing will be slowed down to + // options.delayed_write_rate if we are writing to the last write buffer + // allowed. // // Default: 2 // @@ -258,7 +301,7 @@ struct ColumnFamilyOptions { // The minimum number of write buffers that will be merged together // before writing to storage. If set to 1, then - // all write buffers are fushed to L0 as individual files and this increases + // all write buffers are flushed to L0 as individual files and this increases // read amplification because a get request has to check in all of these // files. Also, an in-memory merge may result in writing lesser // data to storage if there are duplicate records in each of these @@ -270,9 +313,17 @@ struct ColumnFamilyOptions { // max_write_buffer_number, this parameter does not affect flushing. // This controls the minimum amount of write history that will be available // in memory for conflict checking when Transactions are used. + // + // When using an OptimisticTransactionDB: // If this value is too low, some transactions may fail at commit time due // to not being able to determine whether there were any write conflicts. // + // When using a TransactionDB: + // If Transaction::SetSnapshot is used, TransactionDB will read either + // in-memory write buffers or SST files to do write-conflict checking. + // Increasing this value can reduce the number of reads to SST files + // done for conflict detection. + // // Setting this value to 0 will cause write buffers to be freed immediately // after they are flushed. // If this value is set to -1, 'max_write_buffer_number' will be used. @@ -322,6 +373,13 @@ struct ColumnFamilyOptions { // change when data grows. std::vector compression_per_level; + // Compression algorithm that will be used for the bottommost level that + // contain files. If level-compaction is used, this option will only affect + // levels after base level. 
+ // + // Default: kDisableCompressionOption (Disabled) + CompressionType bottommost_compression; + // different options for compression algorithms CompressionOptions compression_opts; @@ -376,7 +434,7 @@ struct ColumnFamilyOptions { // be 2MB, and each file on level 2 will be 20MB, // and each file on level-3 will be 200MB. // - // Default: 2MB. + // Default: 64MB. // // Dynamically changeable through SetOptions() API uint64_t target_file_size_base; @@ -391,12 +449,12 @@ struct ColumnFamilyOptions { // max_bytes_for_level_base is the max total for level-1. // Maximum number of bytes for level L can be calculated as // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) - // For example, if max_bytes_for_level_base is 20MB, and if + // For example, if max_bytes_for_level_base is 200MB, and if // max_bytes_for_level_multiplier is 10, total data size for level-1 - // will be 20MB, total file size for level-2 will be 200MB, - // and total file size for level-3 will be 2GB. + // will be 200MB, total file size for level-2 will be 2GB, + // and total file size for level-3 will be 20GB. // - // Default: 10MB. + // Default: 256MB. // // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base; @@ -499,6 +557,7 @@ struct ColumnFamilyOptions { // Dynamically changeable through SetOptions() API int max_grandparent_overlap_factor; + // DEPRECATED -- this options is no longer used // Puts are delayed to options.delayed_write_rate when any level has a // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0. // @@ -507,13 +566,19 @@ struct ColumnFamilyOptions { // Dynamically changeable through SetOptions() API double soft_rate_limit; - // DEPRECATED -- this options is no longer usde + // DEPRECATED -- this options is no longer used double hard_rate_limit; + // All writes will be slowed down to at least delayed_write_rate if estimated + // bytes needed to be compaction exceed this threshold. 
+ // + // Default: 64GB + uint64_t soft_pending_compaction_bytes_limit; + // All writes are stopped if estimated bytes needed to be compaction exceed // this threshold. // - // Default: 0 (disabled) + // Default: 256GB uint64_t hard_pending_compaction_bytes_limit; // DEPRECATED -- this options is no longer used @@ -523,7 +588,7 @@ struct ColumnFamilyOptions { // If <= 0, a proper value is automatically calculated (usually 1/8 of // writer_buffer_size, rounded up to a multiple of 4KB). // - // There are two additonal restriction of the The specified size: + // There are two additional restriction of the The specified size: // (1) size should be in the range of [4096, 2 << 30] and // (2) be the multiple of the CPU word (which helps with the memory // alignment). @@ -551,7 +616,7 @@ struct ColumnFamilyOptions { // If level compaction_style = kCompactionStyleLevel, for each level, // which files are prioritized to be picked to compact. - // Default: kCompactionPriByCompensatedSize + // Default: kByCompensatedSize CompactionPri compaction_pri; // If true, compaction will verify checksum on every read that happens @@ -568,16 +633,6 @@ struct ColumnFamilyOptions { // The options for FIFO compaction style CompactionOptionsFIFO compaction_options_fifo; - // Use KeyMayExist API to filter deletes when this is true. - // If KeyMayExist returns false, i.e. the key definitely does not exist, then - // the delete is a noop. KeyMayExist only incurs in-memory look up. - // This optimization avoids writing the delete to storage when appropriate. - // - // Default: false - // - // Dynamically changeable through SetOptions() API - bool filter_deletes; - // An iteration->Next() sequentially skips over keys with the same // user-key unless this option is set. 
This number specifies the number // of keys (with the same userkey) that will be sequentially @@ -612,7 +667,7 @@ struct ColumnFamilyOptions { // If you'd like to customize some of these options, you will need to // use NewBlockBasedTableFactory() to construct a new table factory. - // This option allows user to to collect their own interested statistics of + // This option allows user to collect their own interested statistics of // the tables. // Default: empty vector -- no user-defined statistics collection will be // performed. @@ -681,25 +736,27 @@ struct ColumnFamilyOptions { Slice delta_value, std::string* merged_value); - // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom - // for memtable + // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, + // create prefix bloom for memtable with the size of + // write_buffer_size * memtable_prefix_bloom_size_ratio. + // If it is larger than 0.25, it is santinized to 0.25. // - // Dynamically changeable through SetOptions() API - uint32_t memtable_prefix_bloom_bits; - - // number of hash probes per key + // Default: 0 (disable) // // Dynamically changeable through SetOptions() API - uint32_t memtable_prefix_bloom_probes; + double memtable_prefix_bloom_size_ratio; - // Page size for huge page TLB for bloom in memtable. If <=0, not allocate - // from huge page TLB but from malloc. - // Need to reserve huge pages for it to be allocated. For example: + // Page size for huge page for the arena used by the memtable. If <=0, it + // won't allocate from huge page but from malloc. + // Users are responsible to reserve huge pages for it to be allocated. For + // example: // sysctl -w vm.nr_hugepages=20 // See linux doc Documentation/vm/hugetlbpage.txt + // If there isn't enough free huge page available, it will fall back to + // malloc. 
// // Dynamically changeable through SetOptions() API - size_t memtable_prefix_bloom_huge_page_tlb_size; + size_t memtable_huge_page_size; // Control locality of bloom filter probes to improve cache miss rate. // This option only applies to memtable prefix bloom and plaintable @@ -751,9 +808,9 @@ struct ColumnFamilyOptions { // Default: false bool paranoid_file_checks; - // Measure IO stats in compactions, if true. + // Measure IO stats in compactions and flushes, if true. // Default: false - bool compaction_measure_io_stats; + bool report_bg_io_stats; // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); @@ -764,8 +821,16 @@ struct ColumnFamilyOptions { }; struct DBOptions { + // The function recovers options to the option as in version 4.6. + DBOptions* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + // Some functions that make it easier to optimize RocksDB + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + DBOptions* OptimizeForSmallDb(); + #ifndef ROCKSDB_LITE // By default, RocksDB uses only one background thread for flush and // compaction. Calling this function will set it up such that total of @@ -806,12 +871,21 @@ struct DBOptions { // Default: nullptr std::shared_ptr rate_limiter; - // Use to control files deletion rate, can be used among multiple - // RocksDB instances. delete_scheduler is only used to delete table files that - // need to be deleted from the first db_path (db_name if db_paths is empty), - // other files types and other db_paths wont be affected by delete_scheduler. - // Default: nullptr (disabled) - std::shared_ptr delete_scheduler; + // Use to track SST files and control their file deletion rate. + // + // Features: + // - Throttle the deletion rate of the SST files. + // - Keep track the total size of all SST files. 
+ // - Set a maximum allowed space limit for SST files that when reached + // the DB wont do any further flushes or compactions and will set the + // background error. + // - Can be shared between multiple dbs. + // Limitations: + // - Only track and throttle deletes of SST files in + // first db_path (db_name if db_paths is empty). + // + // Default: nullptr + std::shared_ptr sst_file_manager; // Any internal progress/error information generated by the db will // be written to info_log if it is non-nullptr, or to a file stored @@ -826,12 +900,12 @@ struct DBOptions { // files opened are always kept open. You can estimate number of files based // on target_file_size_base and target_file_size_multiplier for level-based // compaction. For universal-style compaction, you can usually set it to -1. - // Default: 5000 or ulimit value of max open files (whichever is smaller) + // Default: -1 int max_open_files; // If max_open_files is -1, DB will open all files on DB::Open(). You can // use this option to increase the number of threads used to open the files. - // Default: 1 + // Default: 16 int max_file_opening_threads; // Once write-ahead logs exceed this size, we will start forcing the flush of @@ -880,7 +954,7 @@ struct DBOptions { // If none of the paths has sufficient room to place a file, the file will // be placed to the last path anyway, despite to the target size. // - // Placing newer data to ealier paths is also best-efforts. User should + // Placing newer data to earlier paths is also best-efforts. User should // expect user files to be placed in higher levels in some extreme cases. // // If left empty, only one path will be used, which is db_name passed when @@ -909,8 +983,19 @@ struct DBOptions { // regardless of this setting uint64_t delete_obsolete_files_period_micros; + // Suggested number of concurrent background compaction jobs, submitted to + // the default LOW priority thread pool. 
+ // + // Default: 1 + int base_background_compactions; + // Maximum number of concurrent background compaction jobs, submitted to // the default LOW priority thread pool. + // We first try to schedule compactions based on + // `base_background_compactions`. If the compaction cannot catch up , we + // will increase number of compaction threads up to + // `max_background_compactions`. + // // If you're increasing this, also consider increasing number of threads in // LOW priority thread pool. For more information, see // Env::SetBackgroundThreads @@ -957,6 +1042,16 @@ struct DBOptions { // Default: 1000 size_t keep_log_file_num; + // Recycle log files. + // If non-zero, we will reuse previously written log files for new + // logs, overwriting the old data. The value indicates how many + // such files we will keep around at any point in time for later + // use. This is more efficient because the blocks are already + // allocated and fdatasync does not need to update the inode after + // each write. + // Default: 0 + size_t recycle_log_file_num; + // manifest file is rolled over on reaching this limit. // The older manifest file be deleted. // The default value is MAX_INT so that roll-over does not take place. @@ -989,7 +1084,23 @@ struct DBOptions { // large amounts of data (such as xfs's allocsize option). size_t manifest_preallocation_size; - // Data being read from file storage may be buffered in the OS + // Hint the OS that it should not buffer disk I/O. Enabling this + // parameter may improve performance but increases pressure on the + // system cache. + // + // The exact behavior of this parameter is platform dependent. + // + // On POSIX systems, after RocksDB reads data from disk it will + // mark the pages as "unneeded". The operating system may - or may not + // - evict these pages from memory, reducing pressure on the system + // cache. If the disk block is requested again this can result in + // additional disk I/O. 
+ // + // On WINDOWS system, files will be opened in "unbuffered I/O" mode + // which means that data read from the disk will not be cached or + // bufferized. The hardware buffer of the devices may however still + // be used. Memory mapped files are not impacted by this parameter. + // // Default: true bool allow_os_buffer; @@ -1031,6 +1142,22 @@ struct DBOptions { // Default: 0 (disabled) size_t db_write_buffer_size; + // The memory usage of memtable will report to this object. The same object + // can be passed into multiple DBs and it will track the sum of size of all + // the DBs. If the total size of all live memtables of all the DBs exceeds + // a limit, a flush will be triggered in the next DB to which the next write + // is issued. + // + // If the object is only passed to on DB, the behavior is the same as + // db_write_buffer_size. When write_buffer_manager is set, the value set will + // override db_write_buffer_size. + // + // This feature is disabled by default. Specify a non-zero value + // to enable it. + // + // Default: null + std::shared_ptr write_buffer_manager; + // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL @@ -1064,6 +1191,31 @@ struct DBOptions { // Default: 0 size_t compaction_readahead_size; + // This is a maximum buffer size that is used by WinMmapReadableFile in + // unbuffered disk I/O mode. We need to maintain an aligned buffer for + // reads. We allow the buffer to grow until the specified value and then + // for bigger requests allocate one shot buffers. In unbuffered mode we + // always bypass read-ahead buffer at ReadaheadRandomAccessFile + // When read-ahead is required we then make use of compaction_readahead_size + // value and always try to read ahead. With read-ahead we always + // pre-allocate buffer to the size instead of growing it up to a limit. 
+ // + // This option is currently honored only on Windows + // + // Default: 1 Mb + // + // Special value: 0 - means do not maintain per instance buffer. Allocate + // per request buffer and avoid locking. + size_t random_access_max_buffer_size; + + // This is the maximum buffer size that is used by WritableFileWriter. + // On Windows, we need to maintain an aligned buffer for writes. + // We allow the buffer to grow until it's size hits the limit. + // + // Default: 1024 * 1024 (1 MB) + size_t writable_file_max_buffer_size; + + // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not // heavily contended. However, if the mutex is hot, we could end up @@ -1106,14 +1258,54 @@ struct DBOptions { // Default: false bool enable_thread_tracking; - // The limited write rate to DB if soft_rate_limit or - // level0_slowdown_writes_trigger is triggered. It is calculated using - // size of user write requests before compression. + // The limited write rate to DB if soft_pending_compaction_bytes_limit or + // level0_slowdown_writes_trigger is triggered, or we are writing to the + // last mem table allowed and we allow more than 3 mem tables. It is + // calculated using size of user write requests before compression. + // RocksDB may decide to slow down more if the compaction still + // gets behind further. // Unit: byte per second. // - // Default: 1MB/s + // Default: 2MB/s uint64_t delayed_write_rate; + // If true, allow multi-writers to update mem tables in parallel. + // Only some memtable_factory-s support concurrent writes; currently it + // is implemented only for SkipListFactory. Concurrent memtable writes + // are not compatible with inplace_update_support or filter_deletes. + // It is strongly recommended to set enable_write_thread_adaptive_yield + // if you are going to use this feature. 
+ // + // Default: false + bool allow_concurrent_memtable_write; + + // If true, threads synchronizing with the write batch group leader will + // wait for up to write_thread_max_yield_usec before blocking on a mutex. + // This can substantially improve throughput for concurrent workloads, + // regardless of whether allow_concurrent_memtable_write is enabled. + // + // Default: false + bool enable_write_thread_adaptive_yield; + + // The maximum number of microseconds that a write operation will use + // a yielding spin loop to coordinate with other write threads before + // blocking on a mutex. (Assuming write_thread_slow_yield_usec is + // set properly) increasing this value is likely to increase RocksDB + // throughput at the expense of increased CPU usage. + // + // Default: 100 + uint64_t write_thread_max_yield_usec; + + // The latency in microseconds after which a std::this_thread::yield + // call (sched_yield on Linux) is considered to be a signal that + // other processes or threads would like to use the current core. + // Increasing this makes writer threads more likely to take CPU + // by spinning, which will show up as an increase in the number of + // involuntary context switches. + // + // Default: 3 + uint64_t write_thread_slow_yield_usec; + // If true, then DB::Open() will not update the statistics used to optimize // compaction decision by loading table properties from many files. // Turning off this feature will improve DBOpen time especially in @@ -1123,13 +1315,47 @@ struct DBOptions { bool skip_stats_update_on_db_open; // Recovery mode to control the consistency while replaying WAL - // Default: kTolerateCorruptedTailRecords + // Default: kPointInTimeRecovery WALRecoveryMode wal_recovery_mode; + // if set to false then recovery will fail when a prepared + // transaction is encountered in the WAL + bool allow_2pc = false; + // A global cache for table-level rows. // Default: nullptr (disabled) // Not supported in ROCKSDB_LITE mode! 
std::shared_ptr row_cache; + +#ifndef ROCKSDB_LITE + // A filter object supplied to be invoked while processing write-ahead-logs + // (WALs) during recovery. The filter provides a way to inspect log + // records, ignoring a particular record or skipping replay. + // The filter is invoked at startup and is invoked from a single-thread + // currently. + WalFilter* wal_filter; +#endif // ROCKSDB_LITE + + // If true, then DB::Open / CreateColumnFamily / DropColumnFamily + // / SetOptions will fail if options file is not detected or properly + // persisted. + // + // DEFAULT: false + bool fail_if_options_file_error; + + // If true, then print malloc stats together with rocksdb.stats + // when printing to LOG. + // DEFAULT: false + bool dump_malloc_stats; + + // By default RocksDB replay WAL logs and flush them on DB open, which may + // create very small SST files. If this option is enabled, RocksDB will try + // to avoid (but not guarantee not to) flush during recovery. Also, existing + // WAL logs will be kept, so that if crash happened before flush, we still + // have logs to recover from. + // + // DEFAULT: false + bool avoid_flush_during_recovery; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1141,10 +1367,16 @@ struct Options : public DBOptions, public ColumnFamilyOptions { const ColumnFamilyOptions& column_family_options) : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} + // The function recovers options to the option as in version 4.6. + Options* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + void Dump(Logger* log) const; void DumpCFOptions(Logger* log) const; + // Some functions that make it easier to optimize RocksDB + // Set appropriate parameters for bulk loading. // The reason that this is a function that returns "this" instead of a // constructor is to enable chaining of multiple similar calls in the future. 
@@ -1154,6 +1386,10 @@ struct Options : public DBOptions, public ColumnFamilyOptions { // It's recommended to manually call CompactRange(NULL, NULL) before reading // from the database, because otherwise the read can be very slow. Options* PrepareForBulkLoad(); + + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + Options* OptimizeForSmallDb(); }; // @@ -1164,8 +1400,12 @@ struct Options : public DBOptions, public ColumnFamilyOptions { // the block cache. It will not page in data from the OS cache or data that // resides in storage. enum ReadTier { - kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage - kBlockCacheTier = 0x1 // data in memtable or block cache + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kBlockCacheTier = 0x1, // data in memtable or block cache + kPersistedTier = 0x2 // persisted data. When WAL is disabled, this option + // will skip data in memtable. + // Note that this ReadTier currently only supports + // Get and MultiGet and does not support iterators. }; // Options that control read operations @@ -1246,8 +1486,39 @@ struct ReadOptions { // Enable a total order seek regardless of index format (e.g. hash index) // used in the table. Some table format (e.g. plain table) may not support // this option. + // If true when calling Get(), we also skip prefix bloom when reading from + // block based table. It provides a way to read exisiting data after + // changing implementation of prefix extractor. bool total_order_seek; + // Enforce that the iterator only iterates over the same prefix as the seek. + // This option is effective only for prefix seeks, i.e. prefix_extractor is + // non-null for the column family and total_order_seek is false. Unlike + // iterate_upper_bound, prefix_same_as_start only works within a prefix + // but in both directions. 
+ // Default: false + bool prefix_same_as_start; + + // Keep the blocks loaded by the iterator pinned in memory as long as the + // iterator is not deleted, If used when reading from tables created with + // BlockBasedTableOptions::use_delta_encoding = false, + // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to + // return 1. + // Default: false + bool pin_data; + + // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we + // schedule a background job in the flush job queue and delete obsolete files + // in background. + // Default: false + bool background_purge_on_iterator_cleanup; + + // If non-zero, NewIterator will create a new table reader which + // performs reads of the given size. Using a large size (> 2MB) can + // improve the performance of forward iteration on spinning disks. + // Default: 0 + size_t readahead_size; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; @@ -1301,16 +1572,10 @@ struct FlushOptions { FlushOptions() : wait(true) {} }; -// Get options based on some guidelines. Now only tune parameter based on -// flush/compaction and fill default parameters for other parameters. -// total_write_buffer_limit: budget for memory spent for mem tables -// read_amplification_threshold: comfortable value of read amplification -// write_amplification_threshold: comfortable value of write amplification. -// target_db_size: estimated total DB size. -extern Options GetOptions(size_t total_write_buffer_limit, - int read_amplification_threshold = 8, - int write_amplification_threshold = 32, - uint64_t target_db_size = 68719476736 /* 64GB */); +// Create a Logger from provided DBOptions +extern Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr* logger); // CompactionOptions are used in CompactFiles() call. struct CompactionOptions { @@ -1340,6 +1605,9 @@ enum class BottommostLevelCompaction { // CompactRangeOptions is used by CompactRange() call. 
struct CompactRangeOptions { + // If true, no other compaction will run at the same time as this + // manual compaction + bool exclusive_manual_compaction = true; // If true, compacted files will be moved to the minimum level capable // of holding the data or given level (specified non-negative target_level). bool change_level = false; @@ -1354,6 +1622,7 @@ struct CompactRangeOptions { BottommostLevelCompaction bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter; }; + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/external/rocksdb/include/rocksdb/perf_context.h b/external/rocksdb/include/rocksdb/perf_context.h index a7c993c7b5..1d73fecaa1 100644 --- a/external/rocksdb/include/rocksdb/perf_context.h +++ b/external/rocksdb/include/rocksdb/perf_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -21,48 +21,70 @@ struct PerfContext { void Reset(); // reset all performance counters to zero - std::string ToString() const; + std::string ToString(bool exclude_zero_counters = false) const; uint64_t user_key_comparison_count; // total number of user key comparisons uint64_t block_cache_hit_count; // total number of block cache hits uint64_t block_read_count; // total number of block reads (with IO) uint64_t block_read_byte; // total number of bytes from block reads - uint64_t block_read_time; // total time spent on block reads - uint64_t block_checksum_time; // total time spent on block checksum - uint64_t block_decompress_time; // total time spent on block decompression - // total number of internal keys skipped over during iteration (overwritten or - // deleted, to be more specific, hidden by a put or delete of the same key) + uint64_t block_read_time; // total nanos spent on block reads + uint64_t block_checksum_time; // total nanos spent on block checksum + uint64_t block_decompress_time; // total nanos spent on block decompression + // total number of internal keys skipped over during iteration. + // There are several reasons for it: + // 1. when calling Next(), the iterator is in the position of the previous + // key, so that we'll need to skip it. It means this counter will always + // be incremented in Next(). + // 2. when calling Next(), we need to skip internal entries for the previous + // keys that are overwritten. + // 3. when calling Next(), Seek() or SeekToFirst(), after previous key + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next + // valid key that the operation should place the iterator to. We need + // to skip both of the tombstone and updates hidden by the tombstones. The + // tombstones are not included in this counter, while previous updates + // hidden by the tombstones will be included here. + // 4. 
symmetric cases for Prev() and SeekToLast() + // We sometimes also skip entries of more recent updates than the snapshot + // we read from, but they are not included in this counter. + // uint64_t internal_key_skipped_count; - // total number of deletes and single deletes skipped over during iteration + // Total number of deletes and single deletes skipped over during iteration + // When calling Next(), Seek() or SeekToFirst(), after previous position + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next valid + // key. Every deleted key is counted once. We don't recount here if there are + // still older updates invalidated by the tombstones. + // uint64_t internal_delete_skipped_count; - uint64_t get_snapshot_time; // total time spent on getting snapshot - uint64_t get_from_memtable_time; // total time spent on querying memtables + uint64_t get_snapshot_time; // total nanos spent on getting snapshot + uint64_t get_from_memtable_time; // total nanos spent on querying memtables uint64_t get_from_memtable_count; // number of mem tables queried - // total time spent after Get() finds a key + // total nanos spent after Get() finds a key uint64_t get_post_process_time; - uint64_t get_from_output_files_time; // total time reading from output files - // total time spent on seeking memtable + uint64_t get_from_output_files_time; // total nanos reading from output files + // total nanos spent on seeking memtable uint64_t seek_on_memtable_time; // number of seeks issued on memtable uint64_t seek_on_memtable_count; - // total time spent on seeking child iters + // total nanos spent on seeking child iters uint64_t seek_child_seek_time; // number of seek issued in child iterators uint64_t seek_child_seek_count; - uint64_t seek_min_heap_time; // total time spent on the merge heap - // total time spent on seeking the internal entries + uint64_t seek_min_heap_time; // total nanos spent on 
the merge heap + // total nanos spent on seeking the internal entries uint64_t seek_internal_seek_time; - // total time spent on iterating internal entries to find the next user entry + // total nanos spent on iterating internal entries to find the next user entry uint64_t find_next_user_entry_time; - // total time spent on writing to WAL + // total nanos spent on writing to WAL uint64_t write_wal_time; - // total time spent on writing to mem tables + // total nanos spent on writing to mem tables uint64_t write_memtable_time; - // total time spent on delaying write + // total nanos spent on delaying write uint64_t write_delay_time; - // total time spent on writing a record, excluding the above three times + // total nanos spent on writing a record, excluding the above three times uint64_t write_pre_and_post_process_time; uint64_t db_mutex_lock_nanos; // time spent on acquiring DB mutex. diff --git a/external/rocksdb/include/rocksdb/perf_level.h b/external/rocksdb/include/rocksdb/perf_level.h index fee8ce1c43..5fddac57c2 100644 --- a/external/rocksdb/include/rocksdb/perf_level.h +++ b/external/rocksdb/include/rocksdb/perf_level.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,10 +13,14 @@ namespace rocksdb { // How much perf stats to collect. Affects perf_context and iostats_context. 
-enum PerfLevel { - kDisable = 0, // disable perf stats - kEnableCount = 1, // enable only count stats - kEnableTime = 2 // enable time stats too +enum PerfLevel : unsigned char { + kUninitialized = 0, // unknown setting + kDisable = 1, // disable perf stats + kEnableCount = 2, // enable only count stats + kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time + // stats except for mutexes + kEnableTime = 4, // enable count and time stats + kOutOfBounds = 5 // N.B. Must always be the last value! }; // set the perf stats level for current thread diff --git a/external/rocksdb/include/rocksdb/persistent_cache.h b/external/rocksdb/include/rocksdb/persistent_cache.h new file mode 100644 index 0000000000..ef49da5ab6 --- /dev/null +++ b/external/rocksdb/include/rocksdb/persistent_cache.h @@ -0,0 +1,49 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +// PersistentCache +// +// Persistent cache interface for caching IO pages on a persistent medium. The +// cache interface is specifically designed for persistent read cache. 
+class PersistentCache { + public: + virtual ~PersistentCache() {} + + // Insert to page cache + // + // page_key Identifier to identify a page uniquely across restarts + // data Page data + // size Size of the page + virtual Status Insert(const Slice& key, const char* data, + const size_t size) = 0; + + // Lookup page cache by page identifier + // + // page_key Page identifier + // buf Buffer where the data should be copied + // size Size of the page + virtual Status Lookup(const Slice& key, std::unique_ptr* data, + size_t* size) = 0; + + // Is cache storing uncompressed data ? + // + // True if the cache is configured to store uncompressed data else false + virtual bool IsCompressed() = 0; +}; + +} // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/rate_limiter.h b/external/rocksdb/include/rocksdb/rate_limiter.h index ae3ab8f847..b1bf3f4271 100644 --- a/external/rocksdb/include/rocksdb/rate_limiter.h +++ b/external/rocksdb/include/rocksdb/rate_limiter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/slice.h b/external/rocksdb/include/rocksdb/slice.h index ae3139cfd6..38d494ed98 100644 --- a/external/rocksdb/include/rocksdb/slice.h +++ b/external/rocksdb/include/rocksdb/slice.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -73,9 +73,22 @@ class Slice { size_ -= n; } + void remove_suffix(size_t n) { + assert(n <= size()); + size_ -= n; + } + // Return a string that contains the copy of the referenced data. + // when hex is true, returns a string of twice the length hex encoded (0-9A-F) std::string ToString(bool hex = false) const; + // Decodes the current slice interpreted as an hexadecimal string into result, + // if successful returns true, if this isn't a valid hex string + // (e.g not coming from Slice::ToString(true)) DecodeHex returns false. + // This slice is expected to have an even number of 0-9A-F characters + // also accepts lowercase (a-f) + bool DecodeHex(std::string* result) const; + // Three-way comparison. Returns value: // < 0 iff "*this" < "b", // == 0 iff "*this" == "b", @@ -88,6 +101,11 @@ class Slice { (memcmp(data_, x.data_, x.size_) == 0)); } + bool ends_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0)); + } + // Compare two slices and returns the first byte where they differ size_t difference_offset(const Slice& b) const; diff --git a/external/rocksdb/include/rocksdb/slice_transform.h b/external/rocksdb/include/rocksdb/slice_transform.h index 3694c58022..d123258121 100644 --- a/external/rocksdb/include/rocksdb/slice_transform.h +++ b/external/rocksdb/include/rocksdb/slice_transform.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/include/rocksdb/snapshot.h b/external/rocksdb/include/rocksdb/snapshot.h index aad675b4b2..d8d999dc25 100644 --- a/external/rocksdb/include/rocksdb/snapshot.h +++ b/external/rocksdb/include/rocksdb/snapshot.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -33,6 +33,9 @@ class ManagedSnapshot { public: explicit ManagedSnapshot(DB* db); + // Instead of creating a snapshot, take ownership of the input snapshot. + ManagedSnapshot(DB* db, const Snapshot* _snapshot); + ~ManagedSnapshot(); const Snapshot* snapshot(); diff --git a/external/rocksdb/include/rocksdb/sst_dump_tool.h b/external/rocksdb/include/rocksdb/sst_dump_tool.h index 39bfb519b2..0dd94caba0 100644 --- a/external/rocksdb/include/rocksdb/sst_dump_tool.h +++ b/external/rocksdb/include/rocksdb/sst_dump_tool.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/sst_file_manager.h b/external/rocksdb/include/rocksdb/sst_file_manager.h new file mode 100644 index 0000000000..bee243e4a9 --- /dev/null +++ b/external/rocksdb/include/rocksdb/sst_file_manager.h @@ -0,0 +1,80 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include + +#include "rocksdb/status.h" + +namespace rocksdb { + +class Env; +class Logger; + +// SstFileManager is used to track SST files in the DB and control there +// deletion rate. +// All SstFileManager public functions are thread-safe. +class SstFileManager { + public: + virtual ~SstFileManager() {} + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST files exceeds max_allowed_space, writes to + // RocksDB will fail. + // + // Setting max_allowed_space to 0 will disable this feature, maximum allowed + // space will be infinite (Default value). + // + // thread-safe. + virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0; + + // Return true if the total size of SST files exceeded the maximum allowed + // space usage. + // + // thread-safe. + virtual bool IsMaxAllowedSpaceReached() = 0; + + // Return the total size of all tracked files. + // thread-safe + virtual uint64_t GetTotalSize() = 0; + + // Return a map containing all tracked files and there corresponding sizes. + // thread-safe + virtual std::unordered_map GetTrackedFiles() = 0; + + // Return delete rate limit in bytes per second. + // thread-safe + virtual int64_t GetDeleteRateBytesPerSecond() = 0; +}; + +// Create a new SstFileManager that can be shared among multiple RocksDB +// instances to track SST file and control there deletion rate. +// +// @param env: Pointer to Env object, please see "rocksdb/env.h". +// @param info_log: If not nullptr, info_log will be used to log errors. +// +// == Deletion rate limiting specific arguments == +// @param trash_dir: Path to the directory where deleted files will be moved +// to be deleted in a background thread while applying rate limiting. If this +// directory dont exist, it will be created. 
This directory should not be +// used by any other process or any other SstFileManager, Set to "" to +// disable deletion rate limiting. +// @param rate_bytes_per_sec: How many bytes should be deleted per second, If +// this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb +// in 1 second, we will wait for another 3 seconds before we delete other +// files, Set to 0 to disable deletion rate limiting. +// @param delete_exisitng_trash: If set to true, the newly created +// SstFileManager will delete files that already exist in trash_dir. +// @param status: If not nullptr, status will contain any errors that happened +// during creating the missing trash_dir or deleting existing files in trash. +extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr info_log = nullptr, + std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_exisitng_trash = true, Status* status = nullptr); + +} // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/sst_file_writer.h b/external/rocksdb/include/rocksdb/sst_file_writer.h index eb2f894912..530bed186e 100644 --- a/external/rocksdb/include/rocksdb/sst_file_writer.h +++ b/external/rocksdb/include/rocksdb/sst_file_writer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -49,8 +49,7 @@ struct ExternalSstFileInfo { // All keys in files generated by SstFileWriter will have sequence number = 0 class SstFileWriter { public: - SstFileWriter(const EnvOptions& env_options, - const ImmutableCFOptions& ioptions, + SstFileWriter(const EnvOptions& env_options, const Options& options, const Comparator* user_comparator); ~SstFileWriter(); diff --git a/external/rocksdb/include/rocksdb/statistics.h b/external/rocksdb/include/rocksdb/statistics.h index 9a21fe1746..16fba0906b 100644 --- a/external/rocksdb/include/rocksdb/statistics.h +++ b/external/rocksdb/include/rocksdb/statistics.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -33,14 +33,24 @@ enum Tickers : uint32_t { BLOCK_CACHE_HIT, // # of blocks added to block cache. BLOCK_CACHE_ADD, + // # of failures when adding blocks to block cache. + BLOCK_CACHE_ADD_FAILURES, // # of times cache miss when accessing index block from block cache. BLOCK_CACHE_INDEX_MISS, // # of times cache hit when accessing index block from block cache. BLOCK_CACHE_INDEX_HIT, + // # of bytes of index blocks inserted into cache + BLOCK_CACHE_INDEX_BYTES_INSERT, + // # of bytes of index block erased from cache + BLOCK_CACHE_INDEX_BYTES_EVICT, // # of times cache miss when accessing filter block from block cache. BLOCK_CACHE_FILTER_MISS, // # of times cache hit when accessing filter block from block cache. BLOCK_CACHE_FILTER_HIT, + // # of bytes of bloom filter blocks inserted into cache + BLOCK_CACHE_FILTER_BYTES_INSERT, + // # of bytes of bloom filter block erased from cache + BLOCK_CACHE_FILTER_BYTES_EVICT, // # of times cache miss when accessing data block from block cache. 
BLOCK_CACHE_DATA_MISS, // # of times cache hit when accessing data block from block cache. @@ -49,9 +59,15 @@ enum Tickers : uint32_t { BLOCK_CACHE_BYTES_READ, // # of bytes written into cache. BLOCK_CACHE_BYTES_WRITE, + // # of times bloom filter has avoided file reads. BLOOM_FILTER_USEFUL, + // # persistent cache hit + PERSISTENT_CACHE_HIT, + // # persistent cache miss + PERSISTENT_CACHE_MISS, + // # of memtable hits. MEMTABLE_HIT, // # of memtable misses. @@ -109,6 +125,7 @@ enum Tickers : uint32_t { // Writer has to wait for compaction or flush to finish. STALL_MICROS, // The wait time for db mutex. + // Disabled by default. To enable it set stats level to kAll DB_MUTEX_WAIT_MICROS, RATE_LIMIT_DELAY_MILLIS, NO_ITERATORS, // number of iterators currently open @@ -139,13 +156,17 @@ enum Tickers : uint32_t { GET_UPDATES_SINCE_CALLS, BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache - WAL_FILE_SYNCED, // Number of times WAL sync is done - WAL_FILE_BYTES, // Number of bytes written to WAL + // Number of blocks added to comopressed block cache + BLOCK_CACHE_COMPRESSED_ADD, + // Number of failures when adding blocks to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL // Writes can be processed by requesting thread or by the thread at the // head of the writers queue. WRITE_DONE_BY_SELF, - WRITE_DONE_BY_OTHER, + WRITE_DONE_BY_OTHER, // Equivalent to writes done for others WRITE_TIMEDOUT, // Number of writes ending up with timed-out. 
WRITE_WITH_WAL, // Number of Write calls that request WAL COMPACT_READ_BYTES, // Bytes read during compaction @@ -158,6 +179,11 @@ enum Tickers : uint32_t { NUMBER_SUPERVERSION_ACQUIRES, NUMBER_SUPERVERSION_RELEASES, NUMBER_SUPERVERSION_CLEANUPS, + + // # of compressions/decompressions executed + NUMBER_BLOCK_COMPRESSED, + NUMBER_BLOCK_DECOMPRESSED, + NUMBER_BLOCK_NOT_COMPRESSED, MERGE_OPERATION_TOTAL_TIME, FILTER_OPERATION_TOTAL_TIME, @@ -175,15 +201,23 @@ const std::vector> TickersNameMap = { {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, + {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"}, {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, + {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"}, + {BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"}, {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, + {BLOCK_CACHE_FILTER_BYTES_INSERT, + "rocksdb.block.cache.filter.bytes.insert"}, + {BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"}, {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"}, {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"}, {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, + {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"}, + {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"}, {MEMTABLE_HIT, "rocksdb.memtable.hit"}, {MEMTABLE_MISS, "rocksdb.memtable.miss"}, {GET_HIT_L0, "rocksdb.l0.hit"}, @@ -226,19 +260,25 @@ const std::vector> TickersNameMap = { {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, {BLOCK_CACHE_COMPRESSED_HIT, 
"rocksdb.block.cachecompressed.hit"}, + {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"}, + {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + "rocksdb.block.cachecompressed.add.failures"}, {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, + {WRITE_TIMEDOUT, "rocksdb.write.timeout"}, {WRITE_WITH_WAL, "rocksdb.write.wal"}, - {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, + {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, "rocksdb.number.direct.load.table.properties"}, {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, + {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"}, + {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"}, {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"}, @@ -279,6 +319,18 @@ enum Histograms : uint32_t { SST_READ_MICROS, // The number of subcompactions actually scheduled during a compaction NUM_SUBCOMPACTIONS_SCHEDULED, + // Value size distribution in each operation + BYTES_PER_READ, + BYTES_PER_WRITE, + BYTES_PER_MULTIGET, + + // number of bytes compressed/decompressed + // number of bytes is when uncompressed; i.e. 
before/after respectively + BYTES_COMPRESSED, + BYTES_DECOMPRESSED, + COMPRESSION_TIMES_NANOS, + DECOMPRESSION_TIMES_NANOS, + HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match }; @@ -306,6 +358,13 @@ const std::vector> HistogramsNameMap = { {WRITE_STALL, "rocksdb.db.write.stall"}, {SST_READ_MICROS, "rocksdb.sst.read.micros"}, {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, + {BYTES_PER_READ, "rocksdb.bytes.per.read"}, + {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, + {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, + {BYTES_COMPRESSED, "rocksdb.bytes.compressed"}, + {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"}, + {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, + {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, }; struct HistogramData { @@ -316,6 +375,19 @@ struct HistogramData { double standard_deviation; }; +enum StatsLevel { + // Collect all stats except the counters requiring to get time inside the + // mutex lock. + kExceptTimeForMutex, + // Collect all stats expect time inside mutex lock AND time spent on + // compression + kExceptDetailedTimers, + // Collect all stats, including measuring duration of mutex operations. + // If getting time is expensive on the platform to run, it can + // reduce scalability to more threads, especially for writes. + kAll, +}; + // Analyze the performance of a db class Statistics { public: @@ -339,6 +411,8 @@ class Statistics { virtual bool HistEnabledForType(uint32_t type) const { return type < HISTOGRAM_ENUM_MAX; } + + StatsLevel stats_level_ = kExceptTimeForMutex; }; // Create a concrete DBStatistics object diff --git a/external/rocksdb/include/rocksdb/status.h b/external/rocksdb/include/rocksdb/status.h index e8e7970ccf..bff15ee0f7 100644 --- a/external/rocksdb/include/rocksdb/status.h +++ b/external/rocksdb/include/rocksdb/status.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -30,7 +30,17 @@ class Status { // Copy the specified status. Status(const Status& s); - void operator=(const Status& s); + Status& operator=(const Status& s); + Status(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + ; + Status& operator=(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + ; bool operator==(const Status& rhs) const; bool operator!=(const Status& rhs) const; @@ -48,7 +58,7 @@ class Status { kAborted = 10, kBusy = 11, kExpired = 12, - kTryAgain = 13 + kTryAgain = 13, }; Code code() const { return code_; } @@ -214,15 +224,41 @@ class Status { inline Status::Status(const Status& s) : code_(s.code_), subcode_(s.subcode_) { state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } -inline void Status::operator=(const Status& s) { +inline Status& Status::operator=(const Status& s) { // The following condition catches both aliasing (when this == &s), // and the common case where both s and *this are ok. - code_ = s.code_; - subcode_ = s.subcode_; - if (state_ != s.state_) { + if(this != &s) { + code_ = s.code_; + subcode_ = s.subcode_; delete[] state_; state_ = (s.state_ == nullptr) ? 
nullptr : CopyState(s.state_); } + return *this; +} + +inline Status::Status(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) +noexcept +#endif + : Status() { + *this = std::move(s); +} + +inline Status& Status::operator=(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) +noexcept +#endif +{ + if(this != &s) { + code_ = std::move(s.code_); + s.code_ = kOk; + subcode_ = std::move(s.subcode_); + s.subcode_ = kNone; + delete [] state_; + state_ = nullptr; + std::swap(state_, s.state_); + } + return *this; } inline bool Status::operator==(const Status& rhs) const { diff --git a/external/rocksdb/include/rocksdb/table.h b/external/rocksdb/include/rocksdb/table.h index e52b580995..3332753165 100644 --- a/external/rocksdb/include/rocksdb/table.h +++ b/external/rocksdb/include/rocksdb/table.h @@ -21,15 +21,16 @@ #include #include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" -#include "rocksdb/immutable_options.h" #include "rocksdb/status.h" namespace rocksdb { // -- Block-based Table class FlushBlockPolicyFactory; +class PersistentCache; class RandomAccessFile; struct TableReaderOptions; struct TableBuilderOptions; @@ -64,6 +65,12 @@ struct BlockBasedTableOptions { // block during table initialization. bool cache_index_and_filter_blocks = false; + // if cache_index_and_filter_blocks is true and the below is true, then + // filter and index blocks are stored in the cache, but a reference is + // held in the "table reader" object so the blocks are pinned and only + // evicted from cache when the table reader is freed. + bool pin_l0_filter_and_index_blocks_in_cache = false; + // The index type that will be used for this table. 
enum IndexType : char { // A space efficient index block that is optimized for @@ -77,10 +84,8 @@ struct BlockBasedTableOptions { IndexType index_type = kBinarySearch; - // Influence the behavior when kHashSearch is used. - // if false, stores a precise prefix to block range mapping - // if true, does not store prefix and allows prefix hash collision - // (less memory consumption) + // This option is now deprecated. No matter what value it is set to, + // it will behave as if hash_index_allow_collision=true. bool hash_index_allow_collision = true; // Use the specified checksum type. Newly created table files will be @@ -97,6 +102,10 @@ struct BlockBasedTableOptions { // If NULL, rocksdb will automatically create and use an 8MB internal cache. std::shared_ptr block_cache = nullptr; + // If non-NULL use the specified cache for pages read from device + // IF NULL, no page cache is used + std::shared_ptr persistent_cache = nullptr; + // If non-NULL use the specified cache for compressed blocks. // If NULL, rocksdb will not use a compressed block cache. std::shared_ptr block_cache_compressed = nullptr; @@ -116,9 +125,19 @@ struct BlockBasedTableOptions { // Number of keys between restart points for delta encoding of keys. // This parameter can be changed dynamically. Most clients should - // leave this parameter alone. + // leave this parameter alone. The minimum value allowed is 1. Any smaller + // value will be silently overwritten with 1. int block_restart_interval = 16; + // Same as block_restart_interval but used for the index block. + int index_block_restart_interval = 1; + + // Use delta encoding to compress keys in blocks. + // ReadOptions::pin_data requires this option to be disabled. + // + // Default: true + bool use_delta_encoding = true; + // If non-nullptr, use the specified filter policy to reduce disk reads. // Many applications will benefit from passing the result of // NewBloomFilterPolicy() here. 
@@ -128,6 +147,25 @@ struct BlockBasedTableOptions { // This must generally be true for gets to be efficient. bool whole_key_filtering = true; + // If true, block will not be explicitly flushed to disk during building + // a SstTable. Instead, buffer in WritableFileWriter will take + // care of the flushing when it is full. + // + // On Windows, this option helps a lot when unbuffered I/O + // (allow_os_buffer = false) is used, since it avoids small + // unbuffered disk write. + // + // User may also adjust writable_file_max_buffer_size to optimize disk I/O + // size. + // + // Default: false + bool skip_table_builder_flush = false; + + // Verify that decompressing the compressed block gives back the input. This + // is a verification mode that we use to detect bugs in compression + // algorithms. + bool verify_compression = false; + // We currently have three versions: // 0 -- This version is currently written out by all RocksDB's versions by // default. Can be read by really old RocksDB's. Doesn't support changing @@ -142,7 +180,7 @@ struct BlockBasedTableOptions { // this. // This option only affects newly written tables. When reading exising tables, // the information about version is read from the footer. - uint32_t format_version = 0; + uint32_t format_version = 2; }; // Table Properties that are specific to block-based table properties. @@ -352,7 +390,8 @@ class TableFactory { virtual Status NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const = 0; + unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache = true) const = 0; // Return a table builder to write to a file for this table type. // @@ -375,7 +414,7 @@ class TableFactory { // to use in this table. 
virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFileWriter* file) const = 0; + uint32_t column_family_id, WritableFileWriter* file) const = 0; // Sanitizes the specified DB Options and ColumnFamilyOptions. // @@ -388,6 +427,22 @@ class TableFactory { // Return a string that contains printable format of table configurations. // RocksDB prints configurations at DB Open(). virtual std::string GetPrintableTableOptions() const = 0; + + // Returns the raw pointer of the table options that is used by this + // TableFactory, or nullptr if this function is not supported. + // Since the return value is a raw pointer, the TableFactory owns the + // pointer and the caller should not delete the pointer. + // + // In certan case, it is desirable to alter the underlying options when the + // TableFactory is not used by any open DB by casting the returned pointer + // to the right class. For instance, if BlockBasedTableFactory is used, + // then the pointer can be casted to BlockBasedTableOptions. + // + // Note that changing the underlying TableFactory options while the + // TableFactory is currently used by any open DB is undefined behavior. + // Developers should use DB::SetOption() instead to dynamically change + // options while the DB is open. + virtual void* GetOptions() { return nullptr; } }; #ifndef ROCKSDB_LITE diff --git a/external/rocksdb/include/rocksdb/table_properties.h b/external/rocksdb/include/rocksdb/table_properties.h index 28500749a4..ee95e84f77 100644 --- a/external/rocksdb/include/rocksdb/table_properties.h +++ b/external/rocksdb/include/rocksdb/table_properties.h @@ -27,46 +27,6 @@ namespace rocksdb { // } typedef std::map UserCollectedProperties; -// TableProperties contains a bunch of read-only properties of its associated -// table. -struct TableProperties { - public: - // the total size of all data blocks. - uint64_t data_size = 0; - // the size of index block. 
- uint64_t index_size = 0; - // the size of filter block. - uint64_t filter_size = 0; - // total raw key size - uint64_t raw_key_size = 0; - // total raw value size - uint64_t raw_value_size = 0; - // the number of blocks in this table - uint64_t num_data_blocks = 0; - // the number of entries in this table - uint64_t num_entries = 0; - // format version, reserved for backward compatibility - uint64_t format_version = 0; - // If 0, key is variable length. Otherwise number of bytes for each key. - uint64_t fixed_key_len = 0; - - // The name of the filter policy used in this table. - // If no filter policy is used, `filter_policy_name` will be an empty string. - std::string filter_policy_name; - - // user collected properties - UserCollectedProperties user_collected_properties; - - // convert this object to a human readable form - // @prop_delim: delimiter for each property. - std::string ToString(const std::string& prop_delim = "; ", - const std::string& kv_delim = "=") const; - - // Aggregate the numerical member variables of the specified - // TableProperties. - void Add(const TableProperties& tp); -}; - // table properties' human-readable names in the property block. struct TablePropertiesNames { static const std::string kDataSize; @@ -79,9 +39,16 @@ struct TablePropertiesNames { static const std::string kFormatVersion; static const std::string kFixedKeyLen; static const std::string kFilterPolicy; + static const std::string kColumnFamilyName; + static const std::string kColumnFamilyId; + static const std::string kComparator; + static const std::string kMergeOperator; + static const std::string kPropertyCollectors; + static const std::string kCompression; }; extern const std::string kPropertiesBlock; +extern const std::string kCompressionDictBlock; enum EntryType { kEntryPut, @@ -106,7 +73,7 @@ class TablePropertiesCollector { // Add() will be called when a new key/value pair is inserted into the table. // @params key the user key that is inserted into the table. 
// @params value the value that is inserted into the table. - virtual Status Add(const Slice& key, const Slice& value) { + virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) { return Status::InvalidArgument( "TablePropertiesCollector::Add() deprecated."); } @@ -115,10 +82,9 @@ class TablePropertiesCollector { // table. // @params key the user key that is inserted into the table. // @params value the value that is inserted into the table. - // @params file_size file size up to now virtual Status AddUserKey(const Slice& key, const Slice& value, - EntryType type, SequenceNumber seq, - uint64_t file_size) { + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) { // For backwards-compatibility. return Add(key, value); } @@ -144,18 +110,90 @@ class TablePropertiesCollector { // TablePropertiesCollector for each new table class TablePropertiesCollectorFactory { public: + struct Context { + uint32_t column_family_id; + static const uint32_t kUnknownColumnFamily; + }; + virtual ~TablePropertiesCollectorFactory() {} // has to be thread-safe - virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0; + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) = 0; // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; }; +// TableProperties contains a bunch of read-only properties of its associated +// table. +struct TableProperties { + public: + // the total size of all data blocks. + uint64_t data_size = 0; + // the size of index block. + uint64_t index_size = 0; + // the size of filter block. 
+ uint64_t filter_size = 0; + // total raw key size + uint64_t raw_key_size = 0; + // total raw value size + uint64_t raw_value_size = 0; + // the number of blocks in this table + uint64_t num_data_blocks = 0; + // the number of entries in this table + uint64_t num_entries = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; + // ID of column family for this SST file, corresponding to the CF identified + // by column_family_name. + uint64_t column_family_id = + rocksdb::TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + + // Name of the column family with which this SST file is associated. + // If column family is unknown, `column_family_name` will be an empty string. + std::string column_family_name; + + // The name of the filter policy used in this table. + // If no filter policy is used, `filter_policy_name` will be an empty string. + std::string filter_policy_name; + + // The name of the comparator used in this table. + std::string comparator_name; + + // The name of the merge operator used in this table. + // If no merge operator is used, `merge_operator_name` will be "nullptr". + std::string merge_operator_name; + + // The names of the property collectors factories used in this table + // separated by commas + // {collector_name[1]},{collector_name[2]},{collector_name[3]} .. + std::string property_collectors_names; + + // The compression algo used to compress the SST files. + std::string compression_name; + + // user collected properties + UserCollectedProperties user_collected_properties; + UserCollectedProperties readable_properties; + + // convert this object to a human readable form + // @prop_delim: delimiter for each property. 
+ std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; + + // Aggregate the numerical member variables of the specified + // TableProperties. + void Add(const TableProperties& tp); +}; + // Extra properties // Below is a list of non-basic properties that are collected by database // itself. Especially some properties regarding to the internal keys (which // is unknown to `table`). extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); +extern uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present); } // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/thread_status.h b/external/rocksdb/include/rocksdb/thread_status.h index d8a61b4906..0cdea2b519 100644 --- a/external/rocksdb/include/rocksdb/thread_status.h +++ b/external/rocksdb/include/rocksdb/thread_status.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/transaction_log.h b/external/rocksdb/include/rocksdb/transaction_log.h index 1b80b9a0c1..1fb93ace16 100644 --- a/external/rocksdb/include/rocksdb/transaction_log.h +++ b/external/rocksdb/include/rocksdb/transaction_log.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/include/rocksdb/types.h b/external/rocksdb/include/rocksdb/types.h index f20bf8277f..6a477cab89 100644 --- a/external/rocksdb/include/rocksdb/types.h +++ b/external/rocksdb/include/rocksdb/types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/universal_compaction.h b/external/rocksdb/include/rocksdb/universal_compaction.h index e0f9f830f3..11490e413a 100644 --- a/external/rocksdb/include/rocksdb/universal_compaction.h +++ b/external/rocksdb/include/rocksdb/universal_compaction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/backupable_db.h b/external/rocksdb/include/rocksdb/utilities/backupable_db.h index 5c32750844..27c1b49ac2 100644 --- a/external/rocksdb/include/rocksdb/utilities/backupable_db.h +++ b/external/rocksdb/include/rocksdb/utilities/backupable_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -75,11 +75,21 @@ struct BackupableDBOptions { // Default: 0 uint64_t backup_rate_limit; + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr backup_rate_limiter{nullptr}; + // Max bytes that can be transferred in a second during restore. // If 0, go as fast as you can // Default: 0 uint64_t restore_rate_limit; + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr restore_rate_limiter{nullptr}; + // Only used if share_table_files is set to true. If true, will consider that // backups can come from different databases, hence a sst is not uniquely // identifed by its name, but by the triple (file name, crc32, file length) @@ -143,13 +153,17 @@ struct BackupInfo { uint64_t size; uint32_t number_files; + std::string app_metadata; BackupInfo() {} BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, - uint32_t _number_files) - : backup_id(_backup_id), timestamp(_timestamp), size(_size), - number_files(_number_files) {} + uint32_t _number_files, const std::string& _app_metadata) + : backup_id(_backup_id), + timestamp(_timestamp), + size(_size), + number_files(_number_files), + app_metadata(_app_metadata) {} }; class BackupStatistics { @@ -179,6 +193,8 @@ class BackupStatistics { uint32_t number_fail_backup; }; +// A backup engine for accessing information about backups and restoring from +// them. 
class BackupEngineReadOnly { public: virtual ~BackupEngineReadOnly() {} @@ -186,9 +202,12 @@ class BackupEngineReadOnly { static Status Open(Env* db_env, const BackupableDBOptions& options, BackupEngineReadOnly** backup_engine_ptr); + // Returns info about backups in backup_info // You can GetBackupInfo safely, even with other BackupEngine performing // backups on the same directory virtual void GetBackupInfo(std::vector* backup_info) = 0; + + // Returns info about corrupt backups in corrupt_backups virtual void GetCorruptedBackups( std::vector* corrupt_backup_ids) = 0; @@ -196,9 +215,12 @@ class BackupEngineReadOnly { // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's // responsibility to synchronize the operation, i.e. don't delete the backup // when you're restoring from it + // See also the corresponding doc in BackupEngine virtual Status RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) = 0; + + // See the corresponding doc in BackupEngine virtual Status RestoreDBFromLatestBackup( const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) = 0; @@ -209,62 +231,37 @@ class BackupEngineReadOnly { virtual Status VerifyBackup(BackupID backup_id) = 0; }; -// Please see the documentation in BackupableDB and RestoreBackupableDB +// A backup engine for creating new backups. class BackupEngine { public: virtual ~BackupEngine() {} + // BackupableDBOptions have to be the same as the ones used in previous + // BackupEngines for the same backup directory. 
static Status Open(Env* db_env, const BackupableDBOptions& options, BackupEngine** backup_engine_ptr); - virtual Status CreateNewBackup( - DB* db, bool flush_before_backup = false, + // same as CreateNewBackup, but stores extra application metadata + virtual Status CreateNewBackupWithMetadata( + DB* db, const std::string& app_metadata, bool flush_before_backup = false, std::function progress_callback = []() {}) = 0; - virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; - virtual Status DeleteBackup(BackupID backup_id) = 0; - virtual void StopBackup() = 0; - - virtual void GetBackupInfo(std::vector* backup_info) = 0; - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // checks that each file exists and that the size of the file matches our - // expectations. it does not check file checksum. - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id) = 0; - - virtual Status GarbageCollect() = 0; -}; - -// Stack your DB with BackupableDB to be able to backup the DB -class BackupableDB : public StackableDB { - public: - // BackupableDBOptions have to be the same as the ones used in a previous - // incarnation of the DB - // - // BackupableDB ownes the pointer `DB* db` now. 
You should not delete it or - // use it after the invocation of BackupableDB - BackupableDB(DB* db, const BackupableDBOptions& options); - virtual ~BackupableDB(); // Captures the state of the database in the latest backup // NOT a thread safe call - Status CreateNewBackup(bool flush_before_backup = false); - // Returns info about backups in backup_info - void GetBackupInfo(std::vector* backup_info); - // Returns info about corrupt backups in corrupt_backups - void GetCorruptedBackups(std::vector* corrupt_backup_ids); + virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false, + std::function progress_callback = + []() {}) { + return CreateNewBackupWithMetadata(db, "", flush_before_backup, + progress_callback); + } + // deletes old backups, keeping latest num_backups_to_keep alive - Status PurgeOldBackups(uint32_t num_backups_to_keep); + virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + // deletes a specific backup - Status DeleteBackup(BackupID backup_id); + virtual Status DeleteBackup(BackupID backup_id) = 0; + // Call this from another thread if you want to stop the backup // that is currently happening. It will return immediatelly, will // not wait for the backup to stop. @@ -272,62 +269,44 @@ class BackupableDB : public StackableDB { // return Status::Incomplete(). It will not clean up after itself, but // the state will remain consistent. The state will be cleaned up // next time you create BackupableDB or RestoreBackupableDB. - void StopBackup(); - - // Will delete all the files we don't need anymore - // It will do the full scan of the files/ directory and delete all the - // files that are not referenced. 
- Status GarbageCollect(); - - private: - BackupEngine* backup_engine_; - Status status_; -}; - -// Use this class to access information about backups and restore from them -class RestoreBackupableDB { - public: - RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options); - ~RestoreBackupableDB(); + virtual void StopBackup() = 0; // Returns info about backups in backup_info - void GetBackupInfo(std::vector* backup_info); + virtual void GetBackupInfo(std::vector* backup_info) = 0; + // Returns info about corrupt backups in corrupt_backups - void GetCorruptedBackups(std::vector* corrupt_backup_ids); + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) = 0; // restore from backup with backup_id - // IMPORTANT -- if options_.share_table_files == true and you restore DB - // from some backup that is not the latest, and you start creating new - // backups from the new DB, they will probably fail + // IMPORTANT -- if options_.share_table_files == true, + // options_.share_files_with_checksum == false, you restore DB from some + // backup that is not the latest, and you start creating new backups from the + // new DB, they will probably fail. // // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. // If you add new data to the DB and try creating a new backup now, the // database will diverge from backups 4 and 5 and the new backup will fail. // If you want to create new backup, you will first have to delete backups 4 // and 5. 
- Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, - const std::string& wal_dir, - const RestoreOptions& restore_options = - RestoreOptions()); + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; // restore from the latest backup - Status RestoreDBFromLatestBackup(const std::string& db_dir, - const std::string& wal_dir, - const RestoreOptions& restore_options = - RestoreOptions()); - // deletes old backups, keeping latest num_backups_to_keep alive - Status PurgeOldBackups(uint32_t num_backups_to_keep); - // deletes a specific backup - Status DeleteBackup(BackupID backup_id); + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + + // checks that each file exists and that the size of the file matches our + // expectations. it does not check file checksum. + // Returns Status::OK() if all checks are good + virtual Status VerifyBackup(BackupID backup_id) = 0; // Will delete all the files we don't need anymore // It will do the full scan of the files/ directory and delete all the // files that are not referenced. - Status GarbageCollect(); - - private: - BackupEngine* backup_engine_; - Status status_; + virtual Status GarbageCollect() = 0; }; } // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/utilities/checkpoint.h b/external/rocksdb/include/rocksdb/utilities/checkpoint.h index b2d5458e55..b4523c25ef 100644 --- a/external/rocksdb/include/rocksdb/utilities/checkpoint.h +++ b/external/rocksdb/include/rocksdb/utilities/checkpoint.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/convenience.h b/external/rocksdb/include/rocksdb/utilities/convenience.h index fae420b778..b0ac15c6df 100644 --- a/external/rocksdb/include/rocksdb/utilities/convenience.h +++ b/external/rocksdb/include/rocksdb/utilities/convenience.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/db_ttl.h b/external/rocksdb/include/rocksdb/utilities/db_ttl.h index 4534e1ff71..09107c50c4 100644 --- a/external/rocksdb/include/rocksdb/utilities/db_ttl.h +++ b/external/rocksdb/include/rocksdb/utilities/db_ttl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/document_db.h b/external/rocksdb/include/rocksdb/utilities/document_db.h index 7fde5ec9f1..52f2257058 100644 --- a/external/rocksdb/include/rocksdb/utilities/document_db.h +++ b/external/rocksdb/include/rocksdb/utilities/document_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/env_librados.h b/external/rocksdb/include/rocksdb/utilities/env_librados.h new file mode 100644 index 0000000000..5c10ea7ccf --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/env_librados.h @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H +#define ROCKSDB_UTILITIES_ENV_LIBRADOS_H + +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/utilities/env_mirror.h" + +#include + +namespace rocksdb { +class LibradosWritableFile; + +class EnvLibrados : public EnvWrapper { +public: + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + Status NewSequentialFile( + const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + Status NewRandomAccessFile( + const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. 
On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + Status NewWritableFile( + const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + // Reuse an existing file by renaming it and opening it as writable. + Status ReuseWritableFile( + const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options); + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + Status NewDirectory( + const std::string& name, + std::unique_ptr* result); + + // Returns OK if the named file exists. + // NotFound if the named file does not exist, + // the calling process does not have permission to determine + // whether this file exists, or if the path is invalid. + // IOError if an IO Error was encountered + Status FileExists(const std::string& fname); + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + Status GetChildren(const std::string& dir, + std::vector* result); + + // Delete the named file. + Status DeleteFile(const std::string& fname); + + // Create the specified directory. Returns error if directory exists. + Status CreateDir(const std::string& dirname); + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + Status CreateDirIfMissing(const std::string& dirname); + + // Delete the specified directory. + Status DeleteDir(const std::string& dirname); + + // Store the size of fname in *file_size. 
+ Status GetFileSize(const std::string& fname, uint64_t* file_size); + + // Store the last modification time of fname in *file_mtime. + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime); + // Rename file src to target. + Status RenameFile(const std::string& src, + const std::string& target); + // Hard Link file src to target. + Status LinkFile(const std::string& src, const std::string& target); + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + Status LockFile(const std::string& fname, FileLock** lock); + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + Status UnlockFile(FileLock* lock); + + // Get full directory name for this db. 
+ Status GetAbsolutePath(const std::string& db_path, + std::string* output_path); + + // Generate unique id + std::string GenerateUniqueId(); + + // Get default EnvLibrados + static EnvLibrados* Default(); + + explicit EnvLibrados(const std::string& db_name, + const std::string& config_path, + const std::string& db_pool); + + explicit EnvLibrados(const std::string& client_name, // first 3 parameters are for RADOS client init + const std::string& cluster_name, + const uint64_t flags, + const std::string& db_name, + const std::string& config_path, + const std::string& db_pool, + const std::string& wal_dir, + const std::string& wal_pool, + const uint64_t write_buffer_size); + ~EnvLibrados() { + _rados.shutdown(); + } +private: + std::string _client_name; + std::string _cluster_name; + uint64_t _flags; + std::string _db_name; // get from user, readable string; Also used as db_id for db metadata + std::string _config_path; + librados::Rados _rados; // RADOS client + std::string _db_pool_name; + librados::IoCtx _db_pool_ioctx; // IoCtx for connecting db_pool + std::string _wal_dir; // WAL dir path + std::string _wal_pool_name; + librados::IoCtx _wal_pool_ioctx; // IoCtx for connecting wal_pool + uint64_t _write_buffer_size; // WritableFile buffer max size + + /* private function to communicate with rados */ + std::string _CreateFid(); + Status _GetFid(const std::string& fname, std::string& fid); + Status _GetFid(const std::string& fname, std::string& fid, int fid_len); + Status _RenameFid(const std::string& old_fname, const std::string& new_fname); + Status _AddFid(const std::string& fname, const std::string& fid); + Status _DelFid(const std::string& fname); + Status _GetSubFnames( + const std::string& dirname, + std::vector * result + ); + librados::IoCtx* _GetIoctx(const std::string& prefix); + friend class LibradosWritableFile; +}; +} +#endif diff --git a/external/rocksdb/include/rocksdb/utilities/env_mirror.h b/external/rocksdb/include/rocksdb/utilities/env_mirror.h 
new file mode 100644 index 0000000000..021fbfa45b --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/env_mirror.h @@ -0,0 +1,166 @@ +// Copyright (c) 2015, Red Hat, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// MirrorEnv is an Env implementation that mirrors all file-related +// operations to two backing Env's (provided at construction time). +// Writes are mirrored. For read operations, we do the read from both +// backends and assert that the results match. +// +// This is useful when implementing a new Env and ensuring that the +// semantics and behavior are correct (in that they match that of an +// existing, stable Env, like the default POSIX one). 
+ +#ifndef ROCKSDB_LITE + +#ifndef STORAGE_ROCKSDB_INCLUDE_UTILITIES_ENVMIRROR_H_ +#define STORAGE_ROCKSDB_INCLUDE_UTILITIES_ENVMIRROR_H_ + +#include +#include +#include +#include "rocksdb/env.h" + +namespace rocksdb { + +class SequentialFileMirror; +class RandomAccessFileMirror; +class WritableFileMirror; + +class EnvMirror : public EnvWrapper { + Env* a_, *b_; + + public: + EnvMirror(Env* a, Env* b) : EnvWrapper(a), a_(a), b_(b) {} + + Status NewSequentialFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) override; + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + unique_ptr* r, + const EnvOptions& options) override; + virtual Status NewDirectory(const std::string& name, + unique_ptr* result) override { + unique_ptr br; + Status as = a_->NewDirectory(name, result); + Status bs = b_->NewDirectory(name, &br); + assert(as == bs); + return as; + } + Status FileExists(const std::string& f) override { + Status as = a_->FileExists(f); + Status bs = b_->FileExists(f); + assert(as == bs); + return as; + } + Status GetChildren(const std::string& dir, + std::vector* r) override { + std::vector ar, br; + Status as = a_->GetChildren(dir, &ar); + Status bs = b_->GetChildren(dir, &br); + assert(as == bs); + std::sort(ar.begin(), ar.end()); + std::sort(br.begin(), br.end()); + if (!as.ok() || ar != br) { + assert(0 == "getchildren results don't match"); + } + *r = ar; + return as; + } + Status DeleteFile(const std::string& f) override { + Status as = a_->DeleteFile(f); + Status bs = b_->DeleteFile(f); + assert(as == bs); + return as; + } + Status CreateDir(const std::string& d) override { + Status as = a_->CreateDir(d); + Status bs = b_->CreateDir(d); + assert(as == bs); + return as; + } + Status 
CreateDirIfMissing(const std::string& d) override { + Status as = a_->CreateDirIfMissing(d); + Status bs = b_->CreateDirIfMissing(d); + assert(as == bs); + return as; + } + Status DeleteDir(const std::string& d) override { + Status as = a_->DeleteDir(d); + Status bs = b_->DeleteDir(d); + assert(as == bs); + return as; + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + uint64_t asize, bsize; + Status as = a_->GetFileSize(f, &asize); + Status bs = b_->GetFileSize(f, &bsize); + assert(as == bs); + assert(!as.ok() || asize == bsize); + *s = asize; + return as; + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + uint64_t amtime, bmtime; + Status as = a_->GetFileModificationTime(fname, &amtime); + Status bs = b_->GetFileModificationTime(fname, &bmtime); + assert(as == bs); + assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000); + *file_mtime = amtime; + return as; + } + + Status RenameFile(const std::string& s, const std::string& t) override { + Status as = a_->RenameFile(s, t); + Status bs = b_->RenameFile(s, t); + assert(as == bs); + return as; + } + + Status LinkFile(const std::string& s, const std::string& t) override { + Status as = a_->LinkFile(s, t); + Status bs = b_->LinkFile(s, t); + assert(as == bs); + return as; + } + + class FileLockMirror : public FileLock { + public: + FileLock* a_, *b_; + FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {} + }; + + Status LockFile(const std::string& f, FileLock** l) override { + FileLock* al, *bl; + Status as = a_->LockFile(f, &al); + Status bs = b_->LockFile(f, &bl); + assert(as == bs); + if (as.ok()) *l = new FileLockMirror(al, bl); + return as; + } + + Status UnlockFile(FileLock* l) override { + FileLockMirror* ml = static_cast(l); + Status as = a_->UnlockFile(ml->a_); + Status bs = b_->UnlockFile(ml->b_); + assert(as == bs); + return as; + } +}; + +} // namespace rocksdb + +#endif // 
STORAGE_ROCKSDB_INCLUDE_UTILITIES_ENVMIRROR_H_ + +#endif // ROCKSDB_LITE diff --git a/external/rocksdb/include/rocksdb/utilities/env_registry.h b/external/rocksdb/include/rocksdb/utilities/env_registry.h new file mode 100644 index 0000000000..4074c87f00 --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/env_registry.h @@ -0,0 +1,45 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/env.h" + +namespace rocksdb { + +// Returns a new Env when called with a URI string. Populates the unique_ptr +// argument if granting ownership to caller. +typedef std::function*)> + EnvFactoryFunc; + +// Creates a new Env using the registered factory function corresponding to a +// prefix of uri. +// +// If no prefixes match, returns nullptr. If multiple prefixes match, the +// factory function used is unspecified. +// +// Populates env_guard with result pointer if caller is granted ownership. +Env* NewEnvFromUri(const std::string& uri, std::unique_ptr* env_guard); + +// To register an Env factory function, initialize an EnvRegistrar object with +// static storage duration. For example: +// +// static EnvRegistrar hdfs_reg("hdfs://", &CreateHdfsEnv); +// +// Then, calling NewEnvFromUri("hdfs://some_path", ...) will use CreateHdfsEnv +// to make a new Env. 
+class EnvRegistrar { + public: + explicit EnvRegistrar(std::string uri_prefix, EnvFactoryFunc env_factory); +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/external/rocksdb/include/rocksdb/utilities/flashcache.h b/external/rocksdb/include/rocksdb/utilities/flashcache.h index 7bb7609248..b54d245f06 100644 --- a/external/rocksdb/include/rocksdb/utilities/flashcache.h +++ b/external/rocksdb/include/rocksdb/utilities/flashcache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/geo_db.h b/external/rocksdb/include/rocksdb/utilities/geo_db.h index 41c0f14081..37e5ebdc74 100644 --- a/external/rocksdb/include/rocksdb/utilities/geo_db.h +++ b/external/rocksdb/include/rocksdb/utilities/geo_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -59,6 +59,16 @@ class GeoObject { } }; +class GeoIterator { + public: + GeoIterator() = default; + virtual ~GeoIterator() {} + virtual void Next() = 0; + virtual bool Valid() const = 0; + virtual const GeoObject& geo_object() = 0; + virtual Status status() const = 0; +}; + // // Stack your DB with GeoDB to be able to get geo-spatial support // @@ -91,14 +101,13 @@ class GeoDB : public StackableDB { // Delete the specified object virtual Status Remove(const Slice& id) = 0; - // Returns a list of all items within a circular radius from the + // Returns an iterator for the items within a circular radius from the // specified gps location. If 'number_of_values' is specified, - // then this call returns at most that many number of objects. + // then the iterator is capped to that number of objects. // The radius is specified in 'meters'. - virtual Status SearchRadial(const GeoPosition& pos, - double radius, - std::vector* values, - int number_of_values = INT_MAX) = 0; + virtual GeoIterator* SearchRadial(const GeoPosition& pos, + double radius, + int number_of_values = INT_MAX) = 0; }; } // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/utilities/info_log_finder.h b/external/rocksdb/include/rocksdb/utilities/info_log_finder.h index 916c54c282..4b7530c28b 100644 --- a/external/rocksdb/include/rocksdb/utilities/info_log_finder.h +++ b/external/rocksdb/include/rocksdb/utilities/info_log_finder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/include/rocksdb/utilities/json_document.h b/external/rocksdb/include/rocksdb/utilities/json_document.h index a5e3ab2562..9473258c8d 100644 --- a/external/rocksdb/include/rocksdb/utilities/json_document.h +++ b/external/rocksdb/include/rocksdb/utilities/json_document.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/external/rocksdb/include/rocksdb/utilities/ldb_cmd.h new file mode 100644 index 0000000000..7df0cd76f0 --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/ldb_cmd.h @@ -0,0 +1,251 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/ldb_tool.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/ldb_cmd_execute_result.h" + +namespace rocksdb { + +class LDBCommand { + public: + // Command-line arguments + static const std::string ARG_DB; + static const std::string ARG_PATH; + static const std::string ARG_HEX; + static const std::string ARG_KEY_HEX; + static const std::string ARG_VALUE_HEX; + static const std::string ARG_CF_NAME; + static const std::string ARG_TTL; + static const std::string ARG_TTL_START; + static const std::string ARG_TTL_END; + static const std::string ARG_TIMESTAMP; + static const std::string ARG_FROM; + static const std::string ARG_TO; + static const std::string ARG_MAX_KEYS; + static const std::string ARG_BLOOM_BITS; + static const std::string ARG_FIX_PREFIX_LEN; + static const std::string ARG_COMPRESSION_TYPE; + static const std::string ARG_COMPRESSION_MAX_DICT_BYTES; + static const std::string ARG_BLOCK_SIZE; + static const std::string ARG_AUTO_COMPACTION; + static const std::string ARG_DB_WRITE_BUFFER_SIZE; + static const std::string ARG_WRITE_BUFFER_SIZE; + static const std::string ARG_FILE_SIZE; + static const std::string ARG_CREATE_IF_MISSING; + static const std::string ARG_NO_VALUE; + + struct ParsedParams { + std::string cmd; + std::vector cmd_params; + std::map option_map; + std::vector flags; + }; + + static LDBCommand* SelectCommand(const ParsedParams& parsed_parms); + + static LDBCommand* InitFromCmdLineArgs( + const std::vector& args, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families, + const std::function& selector = + SelectCommand); + + static LDBCommand* InitFromCmdLineArgs( + int argc, char** argv, const Options& options, + const LDBOptions& 
ldb_options, + const std::vector* column_families); + + bool ValidateCmdLineOptions(); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void SetDBOptions(Options options) { options_ = options; } + + virtual void SetColumnFamilies( + const std::vector* column_families) { + if (column_families != nullptr) { + column_families_ = *column_families; + } else { + column_families_.clear(); + } + } + + void SetLDBOptions(const LDBOptions& ldb_options) { + ldb_options_ = ldb_options; + } + + virtual bool NoDBOpen() { return false; } + + virtual ~LDBCommand() { CloseDB(); } + + /* Run the command, and return the execute result. */ + void Run(); + + virtual void DoCommand() = 0; + + LDBCommandExecuteResult GetExecuteState() { return exec_state_; } + + void ClearPreviousRunState() { exec_state_.Reset(); } + + // Consider using Slice::DecodeHex directly instead if you don't need the + // 0x prefix + static std::string HexToString(const std::string& str); + + // Consider using Slice::ToString(true) directly instead if + // you don't need the 0x prefix + static std::string StringToHex(const std::string& str); + + static const char* DELIM; + + protected: + LDBCommandExecuteResult exec_state_; + std::string db_path_; + std::string column_family_name_; + DB* db_; + DBWithTTL* db_ttl_; + std::map cf_handles_; + + /** + * true implies that this command can work if the db is opened in read-only + * mode. + */ + bool is_read_only_; + + /** If true, the key is input/output as hex in get/put/scan/delete etc. */ + bool is_key_hex_; + + /** If true, the value is input/output as hex in get/put/scan/delete etc. */ + bool is_value_hex_; + + /** If true, the value is treated as timestamp suffixed */ + bool is_db_ttl_; + + // If true, the kvs are output with their insert/modify timestamp in a ttl db + bool timestamp_; + + /** + * Map of options passed on the command-line. + */ + const std::map option_map_; + + /** + * Flags passed on the command-line. 
+ */ + const std::vector flags_; + + /** List of command-line options valid for this command */ + const std::vector valid_cmd_line_options_; + + bool ParseKeyValue(const std::string& line, std::string* key, + std::string* value, bool is_key_hex, bool is_value_hex); + + LDBCommand(const std::map& options, + const std::vector& flags, bool is_read_only, + const std::vector& valid_cmd_line_options); + + void OpenDB(); + + void CloseDB(); + + ColumnFamilyHandle* GetCfHandle(); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_key_hex, + bool is_value_hex); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_hex); + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const std::vector& flags, + const std::string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static std::string HelpRangeCmdArgs(); + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. + */ + static std::vector BuildCmdLineOptions( + std::vector options); + + bool ParseIntOption(const std::map& options, + const std::string& option, int& value, + LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const std::map& options, + const std::string& option, std::string* value); + + Options options_; + std::vector column_families_; + LDBOptions ldb_options_; + + private: + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const std::map& options, + const std::vector& flags); + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. 
+ */ + bool IsValueHex(const std::map& options, + const std::vector& flags); + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const std::map& options, + const std::string& option, bool default_val); + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. + */ + bool StringToBool(std::string val); +}; + +class LDBCommandRunner { + public: + static void PrintHelp(const char* exec_name); + + static void RunCommand( + int argc, char** argv, Options options, const LDBOptions& ldb_options, + const std::vector* column_families); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/external/rocksdb/util/ldb_cmd_execute_result.h b/external/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h similarity index 95% rename from external/rocksdb/util/ldb_cmd_execute_result.h rename to external/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h index 29ebfc240c..94f271c86e 100644 --- a/external/rocksdb/util/ldb_cmd_execute_result.h +++ b/external/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/include/rocksdb/utilities/leveldb_options.h b/external/rocksdb/include/rocksdb/utilities/leveldb_options.h index 8e2c3a1d58..d17bcb0d0d 100644 --- a/external/rocksdb/include/rocksdb/utilities/leveldb_options.h +++ b/external/rocksdb/include/rocksdb/utilities/leveldb_options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,7 +21,7 @@ class Logger; struct Options; class Snapshot; -enum CompressionType : char; +enum CompressionType : unsigned char; // Options to control the behavior of a database (passed to // DB::Open). A LevelDBOptions object can be initialized as though diff --git a/external/rocksdb/include/rocksdb/utilities/memory_util.h b/external/rocksdb/include/rocksdb/utilities/memory_util.h new file mode 100644 index 0000000000..d89bb6adc4 --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/memory_util.h @@ -0,0 +1,50 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/db.h" + +namespace rocksdb { + +// Returns the current memory usage of the specified DB instances. +class MemoryUtil { + public: + enum UsageType : int { + // Memory usage of all the mem-tables. + kMemTableTotal = 0, + // Memory usage of those un-flushed mem-tables. + kMemTableUnFlushed = 1, + // Memory usage of all the table readers. 
+ kTableReadersTotal = 2, + // Memory usage by Cache. + kCacheTotal = 3, + kNumUsageTypes = 4 + }; + + // Returns the approximate memory usage of different types in the input + // list of DBs and Cache set. For instance, in the output map + // usage_by_type, usage_by_type[kMemTableTotal] will store the memory + // usage of all the mem-tables from all the input rocksdb instances. + // + // Note that for memory usage inside Cache class, we will + // only report the usage of the input "cache_set" without + // including those Cache usage inside the input list "dbs" + // of DBs. + static Status GetApproximateMemoryUsageByType( + const std::vector& dbs, + const std::unordered_set cache_set, + std::map* usage_by_type); +}; +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git a/external/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/external/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h index 772e645490..b2c2f99a87 100644 --- a/external/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/external/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -43,15 +43,19 @@ class OptimisticTransactionDB { virtual ~OptimisticTransactionDB() {} - // Starts a new Transaction. Passing set_snapshot=true has the same effect - // as calling SetSnapshot(). + // Starts a new Transaction. // - // Caller should delete the returned transaction after calling - // Commit() or Rollback(). + // Caller is responsible for deleting the returned transaction when no + // longer needed. 
+ // + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. virtual Transaction* BeginTransaction( const WriteOptions& write_options, - const OptimisticTransactionOptions& - txn_options = OptimisticTransactionOptions()) = 0; + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions(), + Transaction* old_txn = nullptr) = 0; // Return the underlying Database that was opened virtual DB* GetBaseDB() = 0; diff --git a/external/rocksdb/include/rocksdb/utilities/option_change_migration.h b/external/rocksdb/include/rocksdb/utilities/option_change_migration.h new file mode 100644 index 0000000000..aa14a02999 --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/option_change_migration.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace rocksdb { +// Try to migrate DB created with old_opts to be use new_opts. +// Multiple column families is not supported. +// It is best-effort. No guarantee to succeed. +// A full compaction may be executed. +Status OptionChangeMigration(std::string dbname, const Options& old_opts, + const Options& new_opts); +} // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/utilities/options_util.h b/external/rocksdb/include/rocksdb/utilities/options_util.h new file mode 100644 index 0000000000..1d961a2bb8 --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/options_util.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +// This file contains utility functions for RocksDB Options. +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace rocksdb { +// Constructs the DBOptions and ColumnFamilyDescriptors by loading the +// latest RocksDB options file stored in the specified rocksdb database. +// +// Note that the all the pointer options (except table_factory, which will +// be described in more details below) will be initialized with the default +// values. Developers can further initialize them after this function call. +// Below is an example list of pointer options which will be initialized +// +// * env +// * memtable_factory +// * compaction_filter_factory +// * prefix_extractor +// * comparator +// * merge_operator +// * compaction_filter +// +// For table_factory, this function further supports deserializing +// BlockBasedTableFactory and its BlockBasedTableOptions except the +// pointer options of BlockBasedTableOptions (flush_block_policy_factory, +// block_cache, and block_cache_compressed), which will be initialized with +// default values. Developers can further specify these three options by +// casting the return value of TableFactoroy::GetOptions() to +// BlockBasedTableOptions and making necessary changes. +// +// examples/options_file_example.cc demonstrates how to use this function +// to open a RocksDB instance. +// +// @return the function returns an OK status when it went successfully. If +// the specified "dbpath" does not contain any option file, then a +// Status::NotFound will be returned. 
A return value other than +// Status::OK or Status::NotFound indicates there're some error related +// to the options file itself. +// +// @see LoadOptionsFromFile +Status LoadLatestOptions(const std::string& dbpath, Env* env, + DBOptions* db_options, + std::vector* cf_descs); + +// Similar to LoadLatestOptions, this function constructs the DBOptions +// and ColumnFamilyDescriptors based on the specified RocksDB Options file. +// +// @see LoadLatestOptions +Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, + DBOptions* db_options, + std::vector* cf_descs); + +// Returns the latest options file name under the specified db path. +Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, + std::string* options_file_name); + +// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors +// are compatible with the latest options stored in the specified DB path. +// +// If the return status is non-ok, it means the specified RocksDB instance +// might not be correctly opened with the input set of options. Currently, +// changing one of the following options will fail the compatibility check: +// +// * comparator +// * prefix_extractor +// * table_factory +// * merge_operator +Status CheckOptionsCompatibility( + const std::string& dbpath, Env* env, const DBOptions& db_options, + const std::vector& cf_descs); + +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git a/external/rocksdb/include/rocksdb/utilities/sim_cache.h b/external/rocksdb/include/rocksdb/utilities/sim_cache.h new file mode 100644 index 0000000000..cc8a01beca --- /dev/null +++ b/external/rocksdb/include/rocksdb/utilities/sim_cache.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once + +#include +#include +#include +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class SimCache; + +// For instrumentation purpose, use NewSimCache instead of NewLRUCache API +// NewSimCache is a wrapper function returning a SimCache instance that can +// have additional interface provided in Simcache class besides Cache interface +// to predict block cache hit rate without actually allocating the memory. It +// can help users tune their current block cache size, and determine how +// efficient they are using the memory. +extern std::shared_ptr NewSimCache(std::shared_ptr cache, + size_t sim_capacity, + int num_shard_bits); + +class SimCache : public Cache { + public: + SimCache() {} + + virtual ~SimCache() {} + + // returns the maximum configured capacity of the simcache for simulation + virtual size_t GetSimCapacity() const = 0; + + // simcache doesn't provide internal handler reference to user, so always + // PinnedUsage = 0 and the behavior will be not exactly consistent the + // with real cache. + // returns the memory size for the entries residing in the simcache. + virtual size_t GetSimUsage() const = 0; + + // sets the maximum configured capacity of the simcache. When the new + // capacity is less than the old capacity and the existing usage is + // greater than new capacity, the implementation will purge old entries + // to fit new capapicty. 
+ virtual void SetSimCapacity(size_t capacity) = 0; + + // returns the lookup times of simcache + virtual uint64_t get_lookup_counter() const = 0; + // returns the hit times of simcache + virtual uint64_t get_hit_counter() const = 0; + // returns the hit rate of simcache + virtual double get_hit_rate() const = 0; + // reset the lookup and hit counters + virtual void reset_counter() = 0; + // String representation of the statistics of the simcache + virtual std::string ToString() const = 0; + + private: + SimCache(const SimCache&); + SimCache& operator=(const SimCache&); +}; + +} // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/utilities/spatial_db.h b/external/rocksdb/include/rocksdb/utilities/spatial_db.h index 50abbf446f..108915fd77 100644 --- a/external/rocksdb/include/rocksdb/utilities/spatial_db.h +++ b/external/rocksdb/include/rocksdb/utilities/spatial_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/include/rocksdb/utilities/stackable_db.h b/external/rocksdb/include/rocksdb/utilities/stackable_db.h index aef192b07e..7565088e0d 100644 --- a/external/rocksdb/include/rocksdb/utilities/stackable_db.h +++ b/external/rocksdb/include/rocksdb/utilities/stackable_db.h @@ -40,6 +40,11 @@ class StackableDB : public DB { return db_->DropColumnFamily(column_family); } + virtual Status DestroyColumnFamilyHandle( + ColumnFamilyHandle* column_family) override { + return db_->DestroyColumnFamilyHandle(column_family); + } + using DB::Put; virtual Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -65,14 +70,14 @@ class StackableDB : public DB { using DB::AddFile; virtual Status AddFile(ColumnFamilyHandle* column_family, - const ExternalSstFileInfo* file_info, + const std::vector& file_info_list, bool move_file) override { - return db_->AddFile(column_family, file_info, move_file); + return db_->AddFile(column_family, file_info_list, move_file); } virtual Status AddFile(ColumnFamilyHandle* column_family, - const std::string& file_path, + const std::vector& file_path_list, bool move_file) override { - return db_->AddFile(column_family, file_path, move_file); + return db_->AddFile(column_family, file_path_list, move_file); } using DB::KeyMayExist; @@ -144,11 +149,18 @@ class StackableDB : public DB { return db_->GetIntProperty(column_family, property, value); } + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { + return db_->GetAggregatedIntProperty(property, value); + } + using DB::GetApproximateSizes; virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* r, int n, uint64_t* sizes, bool include_memtable = false) override { - return db_->GetApproximateSizes(column_family, r, n, sizes); + return db_->GetApproximateSizes(column_family, r, n, sizes, + include_memtable); } using DB::CompactRange; @@ -176,6 
+188,11 @@ class StackableDB : public DB { return db_->ContinueBackgroundWork(); } + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) override { + return db_->EnableAutoCompaction(column_family_handles); + } + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family) override { return db_->NumberLevels(column_family); @@ -267,9 +284,10 @@ class StackableDB : public DB { } using DB::SetOptions; - virtual Status SetOptions( - const std::unordered_map& new_options) override { - return db_->SetOptions(new_options); + virtual Status SetOptions(ColumnFamilyHandle* column_family_handle, + const std::unordered_map& + new_options) override { + return db_->SetOptions(column_family_handle, new_options); } using DB::GetPropertiesOfAllTables; @@ -279,6 +297,13 @@ class StackableDB : public DB { return db_->GetPropertiesOfAllTables(column_family, props); } + using DB::GetPropertiesOfTablesInRange; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfTablesInRange(column_family, range, n, props); + } + virtual Status GetUpdatesSince( SequenceNumber seq_number, unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options) override { diff --git a/external/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/external/rocksdb/include/rocksdb/utilities/table_properties_collectors.h index d31baf9a0c..68a88e7180 100644 --- a/external/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +++ b/external/rocksdb/include/rocksdb/utilities/table_properties_collectors.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/utilities/transaction.h b/external/rocksdb/include/rocksdb/utilities/transaction.h index 6c2640a8e7..5a6c0cf45c 100644 --- a/external/rocksdb/include/rocksdb/utilities/transaction.h +++ b/external/rocksdb/include/rocksdb/utilities/transaction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -20,6 +20,19 @@ class Iterator; class TransactionDB; class WriteBatchWithIndex; +typedef std::string TransactionName; + +// Provides notification to the caller of SetSnapshotOnNextOperation when +// the actual snapshot gets created +class TransactionNotifier { + public: + virtual ~TransactionNotifier() {} + + // Implement this method to receive notification when a snapshot is + // requested via SetSnapshotOnNextOperation. + virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0; +}; + // Provides BEGIN/COMMIT/ROLLBACK transactions. // // To use transactions, you must first create either an OptimisticTransactionDB @@ -61,12 +74,51 @@ class Transaction { // methods. See Transaction::Get() for more details. virtual void SetSnapshot() = 0; + // Similar to SetSnapshot(), but will not change the current snapshot + // until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called. + // By calling this function, the transaction will essentially call + // SetSnapshot() for you right before performing the next write/GetForUpdate. + // + // Calling SetSnapshotOnNextOperation() will not affect what snapshot is + // returned by GetSnapshot() until the next write/GetForUpdate is executed. 
+ // + // When the snapshot is created the notifier's SnapshotCreated method will + // be called so that the caller can get access to the snapshot. + // + // This is an optimization to reduce the likelihood of conflicts that + // could occur in between the time SetSnapshot() is called and the first + // write/GetForUpdate operation. Eg, this prevents the following + // race-condition: + // + // txn1->SetSnapshot(); + // txn2->Put("A", ...); + // txn2->Commit(); + // txn1->GetForUpdate(opts, "A", ...); // FAIL! + virtual void SetSnapshotOnNextOperation( + std::shared_ptr notifier = nullptr) = 0; + // Returns the Snapshot created by the last call to SetSnapshot(). // // REQUIRED: The returned Snapshot is only valid up until the next time - // SetSnapshot() is called or the Transaction is deleted. + // SetSnapshot()/SetSnapshotOnNextSavePoint() is called, ClearSnapshot() + // is called, or the Transaction is deleted. virtual const Snapshot* GetSnapshot() const = 0; + // Clears the current snapshot (i.e. no snapshot will be 'set') + // + // This removes any snapshot that currently exists or is set to be created + // on the next update operation (SetSnapshotOnNextOperation). + // + // Calling ClearSnapshot() has no effect on keys written before this function + // has been called. + // + // If a reference to a snapshot was retrieved via GetSnapshot(), it will no + // longer be valid and should be discarded after a call to ClearSnapshot(). + virtual void ClearSnapshot() = 0; + + // Prepare the current transation for 2PC + virtual Status Prepare() = 0; + // Write all batched keys to the db atomically. // // Returns OK on success. @@ -85,7 +137,7 @@ class Transaction { virtual Status Commit() = 0; // Discard all batched writes in this transaction. - virtual void Rollback() = 0; + virtual Status Rollback() = 0; // Records the state of the transaction for future calls to // RollbackToSavePoint(). 
May be called multiple times to set multiple save @@ -178,14 +230,10 @@ class Transaction { // in this transaction do not yet belong to any snapshot and will be fetched // regardless). // - // Caller is reponsible for deleting the returned Iterator. + // Caller is responsible for deleting the returned Iterator. // // The returned iterator is only valid until Commit(), Rollback(), or // RollbackToSavePoint() is called. - // NOTE: Transaction::Put/Merge/Delete will currently invalidate this iterator - // until - // the following issue is fixed: - // https://github.com/facebook/rocksdb/issues/616 virtual Iterator* GetIterator(const ReadOptions& read_options) = 0; virtual Iterator* GetIterator(const ReadOptions& read_options, @@ -263,6 +311,21 @@ class Transaction { // Similar to WriteBatch::PutLogData virtual void PutLogData(const Slice& blob) = 0; + // By default, all Put/Merge/Delete operations will be indexed in the + // transaction so that Get/GetForUpdate/GetIterator can search for these + // keys. + // + // If the caller does not want to fetch the keys about to be written, + // they may want to avoid indexing as a performance optimization. + // Calling DisableIndexing() will turn off indexing for all future + // Put/Merge/Delete operations until EnableIndexing() is called. + // + // If a key is Put/Merge/Deleted after DisableIndexing is called and then + // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is + // undefined. + virtual void DisableIndexing() = 0; + virtual void EnableIndexing() = 0; + // Returns the number of distinct Keys being tracked by this transaction. // If this transaction was created by a TransactinDB, this is the number of // keys that are currently locked by this transaction. @@ -283,7 +346,7 @@ class Transaction { // committed. 
// // Note: You should not write or delete anything from the batch directly and - // should only use the the functions in the Transaction class to + // should only use the functions in the Transaction class to // write to this transaction. virtual WriteBatchWithIndex* GetWriteBatch() = 0; @@ -292,10 +355,69 @@ class Transaction { // Has no effect on OptimisticTransactions. virtual void SetLockTimeout(int64_t timeout) = 0; + // Return the WriteOptions that will be used during Commit() + virtual WriteOptions* GetWriteOptions() = 0; + + // Reset the WriteOptions that will be used during Commit(). + virtual void SetWriteOptions(const WriteOptions& write_options) = 0; + + // If this key was previously fetched in this transaction using + // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell + // the transaction that it no longer needs to do any conflict checking + // for this key. + // + // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(), + // then UndoGetForUpdate will only have an effect if it is also called N + // times. If this key has been written to in this transaction, + // UndoGetForUpdate() will have no effect. + // + // If SetSavePoint() has been called after the GetForUpdate(), + // UndoGetForUpdate() will not have any effect. + // + // If this Transaction was created by an OptimisticTransactionDB, + // calling UndoGetForUpdate can affect whether this key is conflict checked + // at commit time. + // If this Transaction was created by a TransactionDB, + // calling UndoGetForUpdate may release any held locks for this key. 
+ virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual void UndoGetForUpdate(const Slice& key) = 0; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0; + + virtual WriteBatch* GetCommitTimeWriteBatch() = 0; + + virtual void SetLogNumber(uint64_t log) { log_number_ = log; } + + virtual uint64_t GetLogNumber() { return log_number_; } + + virtual Status SetName(const TransactionName& name) = 0; + + virtual TransactionName GetName() { return name_; } + + enum ExecutionStatus { + STARTED = 0, + AWAITING_PREPARE = 1, + PREPARED = 2, + AWAITING_COMMIT = 3, + COMMITED = 4, + AWAITING_ROLLBACK = 5, + ROLLEDBACK = 6, + LOCKS_STOLEN = 7, + }; + + // Execution status of the transaction. + std::atomic exec_status_; + protected: explicit Transaction(const TransactionDB* db) {} Transaction() {} + // the log in which the prepared section for this txn resides + // (for two phase commit) + uint64_t log_number_; + TransactionName name_; + private: // No copying allowed Transaction(const Transaction&); diff --git a/external/rocksdb/include/rocksdb/utilities/transaction_db.h b/external/rocksdb/include/rocksdb/utilities/transaction_db.h index f9023fc21f..4b64754245 100644 --- a/external/rocksdb/include/rocksdb/utilities/transaction_db.h +++ b/external/rocksdb/include/rocksdb/utilities/transaction_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -54,10 +54,10 @@ struct TransactionDBOptions { // If negative, there is no timeout and will block indefinitely when acquiring // a lock. // - // Not using a a timeout can lead to deadlocks. Currently, there + // Not using a timeout can lead to deadlocks. 
Currently, there // is no deadlock-detection to recover from a deadlock. While DB writes // cannot deadlock with other DB writes, they can deadlock with a transaction. - // A negative timeout should only be used if all transactions have an small + // A negative timeout should only be used if all transactions have a small // expiration set. int64_t default_lock_timeout = 1000; // 1 second @@ -92,8 +92,6 @@ struct TransactionOptions { // will never relinquish any locks it holds. This could prevent keys from // being // written by other writers. - // - // TODO(agiardullo): Improve performance of checking expiration time. int64_t expiration = -1; }; @@ -113,14 +111,21 @@ class TransactionDB : public StackableDB { virtual ~TransactionDB() {} - // Starts a new Transaction. Passing set_snapshot=true has the same effect - // as calling Transaction::SetSnapshot(). + // Starts a new Transaction. + // + // Caller is responsible for deleting the returned transaction when no + // longer needed. // - // Caller should delete the returned transaction after calling - // Transaction::Commit() or Transaction::Rollback(). + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. 
virtual Transaction* BeginTransaction( const WriteOptions& write_options, - const TransactionOptions& txn_options = TransactionOptions()) = 0; + const TransactionOptions& txn_options = TransactionOptions(), + Transaction* old_txn = nullptr) = 0; + + virtual Transaction* GetTransactionByName(const TransactionName& name) = 0; + virtual void GetAllPreparedTransactions(std::vector* trans) = 0; protected: // To Create an TransactionDB, call Open() diff --git a/external/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/external/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h index 773ebc106e..cedf542955 100644 --- a/external/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +++ b/external/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -27,7 +27,7 @@ class TransactionDBMutex { // If returned status is OK, TransactionDB will eventually call UnLock(). virtual Status Lock() = 0; - // Attempt to acquire lock. If timeout is non-negative, operation should be + // Attempt to acquire lock. If timeout is non-negative, operation may be // failed after this many microseconds. // Returns OK on success, // TimedOut if timed out, diff --git a/external/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/external/rocksdb/include/rocksdb/utilities/write_batch_with_index.h index 1e41e78691..ccfd67e5ea 100644 --- a/external/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +++ b/external/rocksdb/include/rocksdb/utilities/write_batch_with_index.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -34,7 +34,8 @@ enum WriteType { kMergeRecord, kDeleteRecord, kSingleDeleteRecord, - kLogDataRecord + kLogDataRecord, + kXIDRecord, }; // an entry for Put, Merge, Delete, or SingleDelete entry for write batches. diff --git a/external/rocksdb/include/rocksdb/version.h b/external/rocksdb/include/rocksdb/version.h index 86a19393e0..d9cc91fae1 100644 --- a/external/rocksdb/include/rocksdb/version.h +++ b/external/rocksdb/include/rocksdb/version.h @@ -1,12 +1,12 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #pragma once #define ROCKSDB_MAJOR 4 -#define ROCKSDB_MINOR 1 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_MINOR 11 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these diff --git a/external/rocksdb/include/rocksdb/wal_filter.h b/external/rocksdb/include/rocksdb/wal_filter.h new file mode 100644 index 0000000000..131fe87e7c --- /dev/null +++ b/external/rocksdb/include/rocksdb/wal_filter.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once +#include +#include + +namespace rocksdb { + +class WriteBatch; + +// WALFilter allows an application to inspect write-ahead-log (WAL) +// records or modify their processing on recovery. +// Please see the details below. +class WalFilter { + public: + enum class WalProcessingOption { + // Continue processing as usual + kContinueProcessing = 0, + // Ignore the current record but continue processing of log(s) + kIgnoreCurrentRecord = 1, + // Stop replay of logs and discard logs + // Logs won't be replayed on subsequent recovery + kStopReplay = 2, + // Corrupted record detected by filter + kCorruptedRecord = 3, + // Marker for enum count + kWalProcessingOptionMax = 4 + }; + + virtual ~WalFilter() {} + + // Provide ColumnFamily->LogNumber map to filter + // so that filter can determine whether a log number applies to a given + // column family (i.e. that log hasn't been flushed to SST already for the + // column family). + // We also pass in name->id map as only name is known during + // recovery (as handles are opened post-recovery). + // while write batch callbacks happen in terms of column family id. + // + // @params cf_lognumber_map column_family_id to lognumber map + // @params cf_name_id_map column_family_name to column_family_id map + + virtual void ColumnFamilyLogNumberMap( + const std::map& cf_lognumber_map, + const std::map& cf_name_id_map) {} + + // LogRecord is invoked for each log record encountered for all the logs + // during replay on logs on recovery. This method can be used to: + // * inspect the record (using the batch parameter) + // * ignoring current record + // (by returning WalProcessingOption::kIgnoreCurrentRecord) + // * reporting corrupted record + // (by returning WalProcessingOption::kCorruptedRecord) + // * stop log replay + // (by returning kStop replay) - please note that this implies + // discarding the logs from current record onwards. + // + // @params log_number log_number of the current log. 
+ // Filter might use this to determine if the log + // record is applicable to a certain column family. + // @params log_file_name log file name - only for informational purposes + // @params batch batch encountered in the log during recovery + // @params new_batch new_batch to populate if filter wants to change + // the batch (for example to filter some records out, + // or alter some records). + // Please note that the new batch MUST NOT contain + // more records than original, else recovery would + // be failed. + // @params batch_changed Whether batch was changed by the filter. + // It must be set to true if new_batch was populated, + // else new_batch has no effect. + // @returns Processing option for the current record. + // Please see WalProcessingOption enum above for + // details. + virtual WalProcessingOption LogRecordFound(unsigned long long log_number, + const std::string& log_file_name, + const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) { + // Default implementation falls back to older function for compatibility + return LogRecord(batch, new_batch, batch_changed); + } + + // Please see the comments for LogRecord above. This function is for + // compatibility only and contains a subset of parameters. + // New code should use the function above. + virtual WalProcessingOption LogRecord(const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) const { + return WalProcessingOption::kContinueProcessing; + } + + // Returns a name that identifies this WAL filter. + // The name will be printed to LOG file on start up for diagnosis. + virtual const char* Name() const = 0; +}; + +} // namespace rocksdb diff --git a/external/rocksdb/include/rocksdb/write_batch.h b/external/rocksdb/include/rocksdb/write_batch.h index a097f2169d..be6374cb39 100644 --- a/external/rocksdb/include/rocksdb/write_batch.h +++ b/external/rocksdb/include/rocksdb/write_batch.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -25,6 +25,7 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ #define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ +#include #include #include #include @@ -71,8 +72,7 @@ class WriteBatch : public WriteBatchBase { void Delete(const SliceParts& key) override { Delete(nullptr, key); } using WriteBatchBase::SingleDelete; - // If the database contains a mapping for "key", erase it. Expects that the - // key was not overwritten. Else do nothing. + // WriteBatch implementation of DB::SingleDelete(). See db.h. void SingleDelete(ColumnFamilyHandle* column_family, const Slice& key) override; void SingleDelete(const Slice& key) override { SingleDelete(nullptr, key); } @@ -125,7 +125,7 @@ class WriteBatch : public WriteBatchBase { // most recent call to SetSavePoint() and removes the most recent save point. // If there is no previous call to SetSavePoint(), Status::NotFound() // will be returned. - // Oterwise returns Status::OK(). + // Otherwise returns Status::OK(). Status RollbackToSavePoint() override; // Support for iterating over the contents of a batch. 
@@ -147,7 +147,7 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and PutCF not implemented"); } - virtual void Put(const Slice& key, const Slice& value) {} + virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {} virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { if (column_family_id == 0) { @@ -157,7 +157,7 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and DeleteCF not implemented"); } - virtual void Delete(const Slice& key) {} + virtual void Delete(const Slice& /*key*/) {} virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) { if (column_family_id == 0) { @@ -167,7 +167,7 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and SingleDeleteCF not implemented"); } - virtual void SingleDelete(const Slice& key) {} + virtual void SingleDelete(const Slice& /*key*/) {} // Merge and LogData are not pure virtual. Otherwise, we would break // existing clients of Handler on a source code level. The default @@ -181,11 +181,28 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and MergeCF not implemented"); } - virtual void Merge(const Slice& key, const Slice& value) {} + virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {} // The default implementation of LogData does nothing. 
virtual void LogData(const Slice& blob); + virtual Status MarkBeginPrepare() { + return Status::InvalidArgument("MarkBeginPrepare() handler not defined."); + } + + virtual Status MarkEndPrepare(const Slice& xid) { + return Status::InvalidArgument("MarkEndPrepare() handler not defined."); + } + + virtual Status MarkRollback(const Slice& xid) { + return Status::InvalidArgument( + "MarkRollbackPrepare() handler not defined."); + } + + virtual Status MarkCommit(const Slice& xid) { + return Status::InvalidArgument("MarkCommit() handler not defined."); + } + // Continue is called by WriteBatch::Iterate. If it returns false, // iteration is halted. Otherwise, it continues iterating. The default // implementation always returns true. @@ -202,17 +219,51 @@ class WriteBatch : public WriteBatchBase { // Returns the number of updates in the batch int Count() const; + // Returns true if PutCF will be called during Iterate + bool HasPut() const; + + // Returns true if DeleteCF will be called during Iterate + bool HasDelete() const; + + // Returns true if SingleDeleteCF will be called during Iterate + bool HasSingleDelete() const; + + // Returns true if MergeCF will be called during Iterate + bool HasMerge() const; + + // Returns true if MarkBeginPrepare will be called during Iterate + bool HasBeginPrepare() const; + + // Returns true if MarkEndPrepare will be called during Iterate + bool HasEndPrepare() const; + + // Returns true if MarkCommit will be called during Iterate + bool HasCommit() const; + + // Returns true if MarkRollback will be called during Iterate + bool HasRollback() const; + using WriteBatchBase::GetWriteBatch; WriteBatch* GetWriteBatch() override { return this; } // Constructor with a serialized string object - explicit WriteBatch(const std::string& rep) - : save_points_(nullptr), rep_(rep) {} + explicit WriteBatch(const std::string& rep); + + WriteBatch(const WriteBatch& src); + WriteBatch(WriteBatch&& src); + WriteBatch& operator=(const WriteBatch& src); + 
WriteBatch& operator=(WriteBatch&& src); private: friend class WriteBatchInternal; SavePoints* save_points_; + // For HasXYZ. Mutable to allow lazy computation of results + mutable std::atomic content_flags_; + + // Performs deferred computation of content_flags if necessary + uint32_t ComputeContentFlags() const; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ diff --git a/external/rocksdb/include/rocksdb/write_batch_base.h b/external/rocksdb/include/rocksdb/write_batch_base.h index c4083754d3..86ccbaa185 100644 --- a/external/rocksdb/include/rocksdb/write_batch_base.h +++ b/external/rocksdb/include/rocksdb/write_batch_base.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/include/rocksdb/write_buffer_manager.h b/external/rocksdb/include/rocksdb/write_buffer_manager.h new file mode 100644 index 0000000000..7b4f095f9e --- /dev/null +++ b/external/rocksdb/include/rocksdb/write_buffer_manager.h @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBufferManager is for managing memory allocation for one or more +// MemTables. 
+ +#pragma once + +#include +#include + +namespace rocksdb { + +class WriteBufferManager { + public: + // _buffer_size = 0 indicates no limit. Memory won't be tracked, + // memory_usage() won't be valid and ShouldFlush() will always return true. + explicit WriteBufferManager(size_t _buffer_size) + : buffer_size_(_buffer_size), memory_used_(0) {} + + ~WriteBufferManager() {} + + bool enabled() const { return buffer_size_ != 0; } + + // Only valid if enabled() + size_t memory_usage() const { + return memory_used_.load(std::memory_order_relaxed); + } + size_t buffer_size() const { return buffer_size_; } + + // Should only be called from write thread + bool ShouldFlush() const { + return enabled() && memory_usage() >= buffer_size(); + } + + // Should only be called from write thread + void ReserveMem(size_t mem) { + if (enabled()) { + memory_used_.fetch_add(mem, std::memory_order_relaxed); + } + } + void FreeMem(size_t mem) { + if (enabled()) { + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + } + } + + private: + const size_t buffer_size_; + std::atomic memory_used_; + + // No copying allowed + WriteBufferManager(const WriteBufferManager&) = delete; + WriteBufferManager& operator=(const WriteBufferManager&) = delete; +}; +} // namespace rocksdb diff --git a/external/rocksdb/java/CMakeLists.txt b/external/rocksdb/java/CMakeLists.txt new file mode 100644 index 0000000000..5f2c942b45 --- /dev/null +++ b/external/rocksdb/java/CMakeLists.txt @@ -0,0 +1,165 @@ +cmake_minimum_required(VERSION 2.6) + +set(JNI_NATIVE_SOURCES + rocksjni/backupenginejni.cc + rocksjni/backupablejni.cc + rocksjni/checkpoint.cc + rocksjni/columnfamilyhandle.cc + rocksjni/compaction_filter.cc + rocksjni/comparator.cc + rocksjni/comparatorjnicallback.cc + rocksjni/env.cc + rocksjni/filter.cc + rocksjni/iterator.cc + rocksjni/loggerjnicallback.cc + rocksjni/memtablejni.cc + rocksjni/merge_operator.cc + rocksjni/options.cc + rocksjni/ratelimiterjni.cc + 
rocksjni/remove_emptyvalue_compactionfilterjni.cc + rocksjni/restorejni.cc + rocksjni/rocksjni.cc + rocksjni/slice.cc + rocksjni/snapshot.cc + rocksjni/statistics.cc + rocksjni/table.cc + rocksjni/transaction_log.cc + rocksjni/ttl.cc + rocksjni/write_batch.cc + rocksjni/writebatchhandlerjnicallback.cc + rocksjni/write_batch_with_index.cc + rocksjni/write_batch_test.cc +) + +set(NATIVE_JAVA_CLASSES + org.rocksdb.AbstractCompactionFilter + org.rocksdb.AbstractComparator + org.rocksdb.AbstractSlice + org.rocksdb.BackupEngine + org.rocksdb.BackupableDB + org.rocksdb.BackupableDBOptions + org.rocksdb.BlockBasedTableConfig + org.rocksdb.BloomFilter + org.rocksdb.Checkpoint + org.rocksdb.ColumnFamilyHandle + org.rocksdb.ColumnFamilyOptions + org.rocksdb.Comparator + org.rocksdb.ComparatorOptions + org.rocksdb.DBOptions + org.rocksdb.DirectComparator + org.rocksdb.DirectSlice + org.rocksdb.Env + org.rocksdb.FlushOptions + org.rocksdb.Filter + org.rocksdb.GenericRateLimiterConfig + org.rocksdb.HashLinkedListMemTableConfig + org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.Logger + org.rocksdb.MergeOperator + org.rocksdb.Options + org.rocksdb.PlainTableConfig + org.rocksdb.ReadOptions + org.rocksdb.RemoveEmptyValueCompactionFilter + org.rocksdb.RestoreBackupableDB + org.rocksdb.RestoreOptions + org.rocksdb.RocksDB + org.rocksdb.RocksEnv + org.rocksdb.RocksIterator + org.rocksdb.RocksMemEnv + org.rocksdb.SkipListMemTableConfig + org.rocksdb.Slice + org.rocksdb.Statistics + org.rocksdb.TransactionLogIterator + org.rocksdb.TtlDB + org.rocksdb.VectorMemTableConfig + org.rocksdb.Snapshot + org.rocksdb.StringAppendOperator + org.rocksdb.WriteBatch + org.rocksdb.WriteBatch.Handler + org.rocksdb.WriteOptions + org.rocksdb.WriteBatchWithIndex + org.rocksdb.WBWIRocksIterator + org.rocksdb.WriteBatchTest + org.rocksdb.WriteBatchTestInternalHelper +) + +include_directories($ENV{JAVA_HOME}/include) +include_directories($ENV{JAVA_HOME}/include/win32) 
+include_directories(${PROJECT_SOURCE_DIR}/java) + +set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) +set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) +set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) +set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) +set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) +set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) +set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) +set(JAVA_TESTCLASSPATH "${JAVA_JUNIT_JAR}\;${JAVA_HAMCR_JAR}\;${JAVA_MOCKITO_JAR}\;${JAVA_CGLIB_JAR}\;${JAVA_ASSERTJ_JAR}") + +if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) + execute_process(COMMAND mkdir ${PROJECT_SOURCE_DIR}/java/classes) +endif() + +if(NOT EXISTS ${JAVA_TEST_LIBDIR}) + execute_process(COMMAND mkdir ${JAVA_TEST_LIBDIR}) +endif() + +if(NOT EXISTS ${JAVA_JUNIT_JAR}) + message("Downloading ${JAVA_JUNIT_JAR}") + file(DOWNLOAD http://search.maven.org/remotecontent?filepath=junit/junit/4.12/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_JUNIT_JAR}) +endif() +if(NOT EXISTS ${JAVA_HAMCR_JAR}) + message("Downloading ${JAVA_HAMCR_JAR}") + file(DOWNLOAD http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_HAMCR_JAR}) +endif() +if(NOT EXISTS ${JAVA_MOCKITO_JAR}) + message("Downloading ${JAVA_MOCKITO_JAR}") + file(DOWNLOAD http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR 
"Failed downloading ${JAVA_MOCKITO_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_MOCKITO_JAR}) +endif() +if(NOT EXISTS ${JAVA_CGLIB_JAR}) + message("Downloading ${JAVA_CGLIB_JAR}") + file(DOWNLOAD http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_CGLIB_JAR}) +endif() +if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) + message("Downloading ${JAVA_ASSERTJ_JAR}") + file(DOWNLOAD http://central.maven.org/maven2/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) +endif() + +if(WIN32) + set(JAVAC cmd /c javac) + set(JAVAH cmd /c javah) +else() + set(JAVAC javac) + set(JAVAH javah) +endif() + +execute_process(COMMAND ${JAVAC} -cp ${JAVA_TESTCLASSPATH} -d ${PROJECT_SOURCE_DIR}/java/classes ${PROJECT_SOURCE_DIR}/java/src/main/java/org/rocksdb/util/*.java ${PROJECT_SOURCE_DIR}/java/src/main/java/org/rocksdb/*.java ${PROJECT_SOURCE_DIR}/java/src/test/java/org/rocksdb/*.java) +execute_process(COMMAND ${JAVAH} -cp ${PROJECT_SOURCE_DIR}/java/classes -d ${PROJECT_SOURCE_DIR}/java/include -jni ${NATIVE_JAVA_CLASSES}) +add_library(rocksdbjni${ARTIFACT_SUFFIX} SHARED ${JNI_NATIVE_SOURCES}) +set_target_properties(rocksdbjni${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/rocksdbjni${ARTIFACT_SUFFIX}.pdb") +target_link_libraries(rocksdbjni${ARTIFACT_SUFFIX} rocksdblib${ARTIFACT_SUFFIX} ${LIBS}) diff --git a/external/rocksdb/java/Makefile b/external/rocksdb/java/Makefile index abc8f73eec..dcfcc0d309 100644 --- a/external/rocksdb/java/Makefile +++ b/external/rocksdb/java/Makefile @@ -2,7 +2,6 @@ 
NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.AbstractComparator\ org.rocksdb.AbstractSlice\ org.rocksdb.BackupEngine\ - org.rocksdb.BackupableDB\ org.rocksdb.BackupableDBOptions\ org.rocksdb.BlockBasedTableConfig\ org.rocksdb.BloomFilter\ @@ -26,7 +25,6 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.PlainTableConfig\ org.rocksdb.ReadOptions\ org.rocksdb.RemoveEmptyValueCompactionFilter\ - org.rocksdb.RestoreBackupableDB\ org.rocksdb.RestoreOptions\ org.rocksdb.RocksDB\ org.rocksdb.RocksEnv\ @@ -62,8 +60,8 @@ endif JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.BackupEngineTest\ - org.rocksdb.BackupableDBTest\ org.rocksdb.BlockBasedTableConfigTest\ + org.rocksdb.util.BytewiseComparatorTest\ org.rocksdb.CheckPointTest\ org.rocksdb.ColumnFamilyOptionsTest\ org.rocksdb.ColumnFamilyTest\ @@ -99,6 +97,7 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.StatisticsCollectorTest\ org.rocksdb.WriteBatchHandlerTest\ org.rocksdb.WriteBatchTest\ + org.rocksdb.WriteBatchThreadedTest\ org.rocksdb.WriteOptionsTest\ org.rocksdb.WriteBatchWithIndexTest @@ -182,7 +181,9 @@ java_test: resolve_test_deps $(TEST_SRC)/org/rocksdb/*.java $(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) -test: java resolve_test_deps java_test +test: java resolve_test_deps java_test run_test + +run_test: java -ea -Xcheck:jni -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java diff --git a/external/rocksdb/java/RELEASE.md b/external/rocksdb/java/RELEASE.md index 084460c888..cb9aaf987b 100644 --- a/external/rocksdb/java/RELEASE.md +++ b/external/rocksdb/java/RELEASE.md @@ -15,7 +15,7 @@ Once you have these items, run this make command from RocksDB's root source dire This command will build RocksDB natively on OSX, and will then spin up two Vagrant Virtualbox Ubuntu 
images to build RocksDB for both 32-bit and 64-bit Linux. -You can find all native binaries and JARs in the java directory upon completion: +You can find all native binaries and JARs in the java/target directory upon completion: librocksdbjni-linux32.so librocksdbjni-linux64.so diff --git a/external/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/external/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java index 14eea09e93..1f805b1e20 100644 --- a/external/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/external/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -21,10 +21,14 @@ */ package org.rocksdb.benchmark; +import java.io.IOException; import java.lang.Runnable; import java.lang.Math; import java.io.File; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; import java.nio.ByteBuffer; +import java.nio.file.Files; import java.util.Collection; import java.util.Date; import java.util.EnumMap; @@ -565,8 +569,7 @@ private void prepareOptions(Options options) throws RocksDBException { (Integer)flags_.get(Flag.bloom_locality)); options.setMinWriteBufferNumberToMerge( (Integer)flags_.get(Flag.min_write_buffer_number_to_merge)); - options.setMemtablePrefixBloomBits( - (Integer)flags_.get(Flag.memtable_bloom_bits)); + options.setMemtablePrefixBloomSizeRatio((Double) flags_.get(Flag.memtable_bloom_size_ratio)); options.setNumLevels( (Integer)flags_.get(Flag.num_levels)); options.setTargetFileSizeBase( @@ -595,12 +598,15 @@ private void prepareOptions(Options options) throws RocksDBException { (Boolean)flags_.get(Flag.disable_auto_compactions)); options.setSourceCompactionFactor( (Integer)flags_.get(Flag.source_compaction_factor)); - options.setFilterDeletes( - (Boolean)flags_.get(Flag.filter_deletes)); options.setMaxSuccessiveMerges( (Integer)flags_.get(Flag.max_successive_merges)); options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds)); options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB)); + if(flags_.get(Flag.java_comparator) != null) { + options.setComparator( + (AbstractComparator)flags_.get(Flag.java_comparator)); + } + /* TODO(yhchiang): enable the following parameters options.setCompressionType((String)flags_.get(Flag.compression_type)); options.setCompressionLevel((Integer)flags_.get(Flag.compression_level)); @@ -774,6 +780,7 @@ void printWarnings() { } private void open(Options options) throws RocksDBException { + System.out.println("Using database directory: " + databaseDir_); db_ = RocksDB.open(options, databaseDir_); } @@ -1185,10 +1192,10 @@ private 
enum Flag { return Integer.parseInt(value); } }, - memtable_bloom_bits(0,"Bloom filter bits per key for memtable.\n" + - "\tNegative means no bloom filter.") { + memtable_bloom_size_ratio(0, "Ratio of memtable used by the bloom filter.\n" + + "\t0 means no bloom filter.") { @Override public Object parseValue(String value) { - return Integer.parseInt(value); + return Double.parseDouble(value); } }, cache_numshardbits(-1,"Number of shards for the block cache\n" + @@ -1475,7 +1482,7 @@ private enum Flag { return Integer.parseInt(value); } }, - db("/tmp/rocksdbjni-bench", + db(getTempDir("rocksdb-jni"), "Use the db with the following name.") { @Override public Object parseValue(String value) { return value; @@ -1486,6 +1493,31 @@ private enum Flag { @Override public Object parseValue(String value) { return parseBoolean(value); } + }, + java_comparator(null, "Class name of a Java Comparator to use instead\n" + + "\tof the default C++ ByteWiseComparatorImpl. Must be available on\n" + + "\tthe classpath") { + @Override + protected Object parseValue(final String value) { + try { + final ComparatorOptions copt = new ComparatorOptions(); + final Class clsComparator = + (Class)Class.forName(value); + final Constructor cstr = + clsComparator.getConstructor(ComparatorOptions.class); + return cstr.newInstance(copt); + } catch(final ClassNotFoundException cnfe) { + throw new IllegalArgumentException("Java Comparator '" + value + "'" + + " not found on the classpath", cnfe); + } catch(final NoSuchMethodException nsme) { + throw new IllegalArgumentException("Java Comparator '" + value + "'" + + " does not have a public ComparatorOptions constructor", nsme); + } catch(final IllegalAccessException | InstantiationException + | InvocationTargetException ie) { + throw new IllegalArgumentException("Unable to construct Java" + + " Comparator '" + value + "'", ie); + } + } }; private Flag(Object defaultValue, String desc) { @@ -1516,6 +1548,18 @@ public boolean parseBoolean(String value) { 
private final String desc_; } + private final static String DEFAULT_TEMP_DIR = "/tmp"; + + private static String getTempDir(final String dirName) { + try { + return Files.createTempDirectory(dirName).toAbsolutePath().toString(); + } catch(final IOException ioe) { + System.err.println("Unable to create temp directory, defaulting to: " + + DEFAULT_TEMP_DIR); + return DEFAULT_TEMP_DIR + File.pathSeparator + dirName; + } + } + private static class RandomGenerator { private final byte[] data_; private int dataLength_; diff --git a/external/rocksdb/java/crossbuild/Vagrantfile b/external/rocksdb/java/crossbuild/Vagrantfile index 8a52b92612..21cce1201a 100644 --- a/external/rocksdb/java/crossbuild/Vagrantfile +++ b/external/rocksdb/java/crossbuild/Vagrantfile @@ -20,7 +20,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| end config.vm.provision :shell, path: "build-linux-centos.sh" - config.vm.synced_folder "../", "/rocksdb-build" + config.vm.synced_folder "../target", "/rocksdb-build" config.vm.synced_folder "../..", "/rocksdb", type: "rsync" config.vm.boot_timeout = 1200 end diff --git a/external/rocksdb/java/crossbuild/build-linux-centos.sh b/external/rocksdb/java/crossbuild/build-linux-centos.sh index f2b79480d5..2e8f81d94b 100644 --- a/external/rocksdb/java/crossbuild/build-linux-centos.sh +++ b/external/rocksdb/java/crossbuild/build-linux-centos.sh @@ -1,15 +1,15 @@ #!/usr/bin/env bash # install all required packages for rocksdb that are available through yum ARCH=$(uname -i) -sudo yum -y install openssl java-1.7.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel +sudo yum -y install openssl java-1.7.0-openjdk-devel.$ARCH # install gcc/g++ 4.8.2 via CERN (http://linux.web.cern.ch/linux/devtoolset/) sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo sudo wget -O /etc/pki/rpm-gpg/RPM-GPG-KEY-cern 
http://ftp.mirrorservice.org/sites/ftp.scientificlinux.org/linux/scientific/51/i386/RPM-GPG-KEYs/RPM-GPG-KEY-cern sudo yum -y install devtoolset-2 -wget http://gflags.googlecode.com/files/gflags-1.6.tar.gz -tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install +wget http://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz +tar xvfz gflags-2.0-no-svn-files.tar.gz; cd gflags-2.0; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib # set java home so we can build rocksdb jars @@ -18,7 +18,7 @@ export JAVA_HOME=/usr/lib/jvm/java-1.7.0 # build rocksdb cd /rocksdb scl enable devtoolset-2 'make jclean clean' -scl enable devtoolset-2 'make rocksdbjavastatic' +scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic' cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff --git a/external/rocksdb/java/rocksjni.pom b/external/rocksdb/java/rocksjni.pom index 0512df63eb..0251d8f364 100644 --- a/external/rocksdb/java/rocksjni.pom +++ b/external/rocksdb/java/rocksjni.pom @@ -102,7 +102,7 @@ Xenu - String fileContents = new File("${project.basedir}/../include/rocksdb/version.h").getText('UTF-8') + String fileContents = new File(project.basedir.absolutePath + '/../include/rocksdb/version.h').getText('UTF-8') matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) String major_version = matcher.getAt(0).getAt(1) matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) diff --git a/external/rocksdb/java/rocksjni/backupablejni.cc b/external/rocksdb/java/rocksjni/backupablejni.cc index d26e46e88e..d9976386ca 100644 --- a/external/rocksdb/java/rocksjni/backupablejni.cc +++ b/external/rocksdb/java/rocksjni/backupablejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,142 +13,24 @@ #include #include -#include "include/org_rocksdb_BackupableDB.h" #include "include/org_rocksdb_BackupableDBOptions.h" #include "rocksjni/portal.h" #include "rocksdb/utilities/backupable_db.h" -/* - * Class: org_rocksdb_BackupableDB - * Method: open - * Signature: (JJ)V - */ -void Java_org_rocksdb_BackupableDB_open( - JNIEnv* env, jobject jbdb, jlong jdb_handle, jlong jopt_handle) { - auto db = reinterpret_cast(jdb_handle); - auto opt = reinterpret_cast(jopt_handle); - auto bdb = new rocksdb::BackupableDB(db, *opt); - - // as BackupableDB extends RocksDB on the java side, we can reuse - // the RocksDB portal here. - rocksdb::RocksDBJni::setHandle(env, jbdb, bdb); -} - -/* - * Class: org_rocksdb_BackupableDB - * Method: createNewBackup - * Signature: (JZ)V - */ -void Java_org_rocksdb_BackupableDB_createNewBackup( - JNIEnv* env, jobject jbdb, jlong jhandle, jboolean jflag) { - rocksdb::Status s = - reinterpret_cast(jhandle)->CreateNewBackup(jflag); - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: org_rocksdb_BackupableDB - * Method: purgeOldBackups - * Signature: (JI)V - */ -void Java_org_rocksdb_BackupableDB_purgeOldBackups( - JNIEnv* env, jobject jbdb, jlong jhandle, jint jnumBackupsToKeep) { - rocksdb::Status s = - reinterpret_cast(jhandle)-> - PurgeOldBackups(jnumBackupsToKeep); - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: org_rocksdb_BackupableDB - * Method: deleteBackup0 - * Signature: (JI)V - */ -void Java_org_rocksdb_BackupableDB_deleteBackup0(JNIEnv* env, - jobject jobj, jlong jhandle, jint jbackup_id) { - auto rdb = reinterpret_cast(jhandle); - rocksdb::Status s = rdb->DeleteBackup(jbackup_id); - - if 
(!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: org_rocksdb_BackupableDB - * Method: getBackupInfo - * Signature: (J)Ljava/util/List; - */ -jobject Java_org_rocksdb_BackupableDB_getBackupInfo( - JNIEnv* env, jobject jbdb, jlong jhandle) { - std::vector backup_infos; - reinterpret_cast(jhandle)-> - GetBackupInfo(&backup_infos); - return rocksdb::BackupInfoListJni::getBackupInfo(env, - backup_infos); -} - -/* - * Class: org_rocksdb_BackupableDB - * Method: getCorruptedBackups - * Signature: (J)[I; - */ -jintArray Java_org_rocksdb_BackupableDB_getCorruptedBackups( - JNIEnv* env, jobject jbdb, jlong jhandle) { - std::vector backup_ids; - reinterpret_cast(jhandle)-> - GetCorruptedBackups(&backup_ids); - // store backupids in int array - const std::vector::size_type - kIdSize = backup_ids.size(); - int int_backup_ids[kIdSize]; - for (std::vector::size_type i = 0; - i != kIdSize; i++) { - int_backup_ids[i] = backup_ids[i]; - } - // Store ints in java array - jintArray ret_backup_ids; - // Its ok to loose precision here (64->32) - jsize ret_backup_ids_size = static_cast(kIdSize); - ret_backup_ids = env->NewIntArray(ret_backup_ids_size); - env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, - int_backup_ids); - return ret_backup_ids; -} - -/* - * Class: org_rocksdb_BackupableDB - * Method: garbageCollect - * Signature: (J)V - */ -void Java_org_rocksdb_BackupableDB_garbageCollect(JNIEnv* env, - jobject jobj, jlong jhandle) { - auto db = reinterpret_cast(jhandle); - rocksdb::Status s = db->GarbageCollect(); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - /////////////////////////////////////////////////////////////////////////// // BackupDBOptions /* * Class: org_rocksdb_BackupableDBOptions * Method: newBackupableDBOptions - * Signature: (Ljava/lang/String;)V + * Signature: (Ljava/lang/String;)J */ -void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions( - JNIEnv* env, jobject jobj, 
jstring jpath) { - const char* cpath = env->GetStringUTFChars(jpath, 0); +jlong Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions( + JNIEnv* env, jclass jcls, jstring jpath) { + const char* cpath = env->GetStringUTFChars(jpath, NULL); auto bopt = new rocksdb::BackupableDBOptions(cpath); env->ReleaseStringUTFChars(jpath, cpath); - rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt); + return reinterpret_cast(bopt); } /* @@ -326,5 +208,4 @@ void Java_org_rocksdb_BackupableDBOptions_disposeInternal( auto bopt = reinterpret_cast(jhandle); assert(bopt); delete bopt; - rocksdb::BackupableDBOptionsJni::setHandle(env, jopt, nullptr); } diff --git a/external/rocksdb/java/rocksjni/backupenginejni.cc b/external/rocksdb/java/rocksjni/backupenginejni.cc index 750ab965a5..a796d6e5be 100644 --- a/external/rocksdb/java/rocksjni/backupenginejni.cc +++ b/external/rocksdb/java/rocksjni/backupenginejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -16,10 +16,10 @@ /* * Class: org_rocksdb_BackupEngine * Method: open - * Signature: (JJ)V + * Signature: (JJ)J */ -void Java_org_rocksdb_BackupEngine_open( - JNIEnv* env, jobject jbe, jlong env_handle, +jlong Java_org_rocksdb_BackupEngine_open( + JNIEnv* env, jclass jcls, jlong env_handle, jlong backupable_db_options_handle) { auto* rocks_env = reinterpret_cast(env_handle); auto* backupable_db_options = @@ -30,11 +30,11 @@ void Java_org_rocksdb_BackupEngine_open( *backupable_db_options, &backup_engine); if (status.ok()) { - rocksdb::BackupEngineJni::setHandle(env, jbe, backup_engine); - return; + return reinterpret_cast(backup_engine); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + return 0; } - - rocksdb::RocksDBExceptionJni::ThrowNew(env, status); } /* @@ -81,20 +81,14 @@ jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups( std::vector backup_ids; backup_engine->GetCorruptedBackups(&backup_ids); // store backupids in int array - const std::vector::size_type - kIdSize = backup_ids.size(); - int int_backup_ids[kIdSize]; - for (std::vector::size_type i = 0; - i != kIdSize; i++) { - int_backup_ids[i] = backup_ids[i]; - } + std::vector int_backup_ids(backup_ids.begin(), backup_ids.end()); // Store ints in java array jintArray ret_backup_ids; // Its ok to loose precision here (64->32) - jsize ret_backup_ids_size = static_cast(kIdSize); + jsize ret_backup_ids_size = static_cast(backup_ids.size()); ret_backup_ids = env->NewIntArray(ret_backup_ids_size); env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, - int_backup_ids); + int_backup_ids.data()); return ret_backup_ids; } diff --git a/external/rocksdb/java/rocksjni/checkpoint.cc b/external/rocksdb/java/rocksjni/checkpoint.cc index 72a40be007..45f0fde6b8 100644 --- a/external/rocksdb/java/rocksjni/checkpoint.cc +++ b/external/rocksdb/java/rocksjni/checkpoint.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/rocksjni/columnfamilyhandle.cc b/external/rocksdb/java/rocksjni/columnfamilyhandle.cc index be3b4c82fd..2a874b1d90 100644 --- a/external/rocksdb/java/rocksjni/columnfamilyhandle.cc +++ b/external/rocksdb/java/rocksjni/columnfamilyhandle.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/rocksjni/compaction_filter.cc b/external/rocksdb/java/rocksjni/compaction_filter.cc index 5fa52c0dc9..6cc6c7732a 100644 --- a/external/rocksdb/java/rocksjni/compaction_filter.cc +++ b/external/rocksdb/java/rocksjni/compaction_filter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -8,6 +8,7 @@ #include +#include "include/org_rocksdb_AbstractCompactionFilter.h" #include "rocksdb/compaction_filter.h" // diff --git a/external/rocksdb/java/rocksjni/comparator.cc b/external/rocksdb/java/rocksjni/comparator.cc index 1963762355..dd11f10e4c 100644 --- a/external/rocksdb/java/rocksjni/comparator.cc +++ b/external/rocksdb/java/rocksjni/comparator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -36,15 +36,15 @@ void Java_org_rocksdb_AbstractComparator_disposeInternal( /* * Class: org_rocksdb_Comparator * Method: createNewComparator0 - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_Comparator_createNewComparator0( +jlong Java_org_rocksdb_Comparator_createNewComparator0( JNIEnv* env, jobject jobj, jlong copt_handle) { const rocksdb::ComparatorJniCallbackOptions* copt = reinterpret_cast(copt_handle); const rocksdb::ComparatorJniCallback* c = new rocksdb::ComparatorJniCallback(env, jobj, copt); - rocksdb::AbstractComparatorJni::setHandle(env, jobj, c); + return reinterpret_cast(c); } // @@ -53,14 +53,14 @@ void Java_org_rocksdb_Comparator_createNewComparator0( /* * Class: org_rocksdb_DirectComparator * Method: createNewDirectComparator0 - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_DirectComparator_createNewDirectComparator0( +jlong Java_org_rocksdb_DirectComparator_createNewDirectComparator0( JNIEnv* env, jobject jobj, jlong copt_handle) { const rocksdb::ComparatorJniCallbackOptions* copt = reinterpret_cast(copt_handle); const rocksdb::DirectComparatorJniCallback* c = new rocksdb::DirectComparatorJniCallback(env, jobj, copt); - rocksdb::AbstractComparatorJni::setHandle(env, jobj, c); + return 
reinterpret_cast(c); } // diff --git a/external/rocksdb/java/rocksjni/comparatorjnicallback.cc b/external/rocksdb/java/rocksjni/comparatorjnicallback.cc index a85b450855..57ee0f95cf 100644 --- a/external/rocksdb/java/rocksjni/comparatorjnicallback.cc +++ b/external/rocksdb/java/rocksjni/comparatorjnicallback.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -60,8 +60,8 @@ int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { // performance. mtx_compare->Lock(); - AbstractSliceJni::setHandle(m_env, m_jSliceA, &a); - AbstractSliceJni::setHandle(m_env, m_jSliceB, &b); + AbstractSliceJni::setHandle(m_env, m_jSliceA, &a, JNI_FALSE); + AbstractSliceJni::setHandle(m_env, m_jSliceB, &b, JNI_FALSE); jint result = m_env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA, m_jSliceB); @@ -89,7 +89,7 @@ void BaseComparatorJniCallback::FindShortestSeparator( // performance. mtx_findShortestSeparator->Lock(); - AbstractSliceJni::setHandle(m_env, m_jSliceLimit, &limit); + AbstractSliceJni::setHandle(m_env, m_jSliceLimit, &limit, JNI_FALSE); jstring jsResultStart = (jstring)m_env->CallObjectMethod(m_jComparator, m_jFindShortestSeparatorMethodId, jsStart, m_jSliceLimit); diff --git a/external/rocksdb/java/rocksjni/comparatorjnicallback.h b/external/rocksdb/java/rocksjni/comparatorjnicallback.h index 65b986ca43..821a91e451 100644 --- a/external/rocksdb/java/rocksjni/comparatorjnicallback.h +++ b/external/rocksdb/java/rocksjni/comparatorjnicallback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/rocksjni/env.cc b/external/rocksdb/java/rocksjni/env.cc index b50d5ae30d..a58f54ea7b 100644 --- a/external/rocksdb/java/rocksjni/env.cc +++ b/external/rocksdb/java/rocksjni/env.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/rocksjni/filter.cc b/external/rocksdb/java/rocksjni/filter.cc index 2ce17d4990..96ef9856bc 100644 --- a/external/rocksdb/java/rocksjni/filter.cc +++ b/external/rocksdb/java/rocksjni/filter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -19,17 +19,17 @@ /* * Class: org_rocksdb_BloomFilter * Method: createBloomFilter - * Signature: (IZ)V + * Signature: (IZ)J */ -void Java_org_rocksdb_BloomFilter_createNewBloomFilter( - JNIEnv* env, jobject jobj, jint bits_per_key, +jlong Java_org_rocksdb_BloomFilter_createNewBloomFilter( + JNIEnv* env, jclass jcls, jint bits_per_key, jboolean use_block_base_builder) { - rocksdb::FilterPolicy* fp = const_cast( + auto* fp = const_cast( rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder)); - std::shared_ptr *pFilterPolicy = + auto* pFilterPolicy = new std::shared_ptr; *pFilterPolicy = std::shared_ptr(fp); - rocksdb::FilterJni::setHandle(env, jobj, pFilterPolicy); + return reinterpret_cast(pFilterPolicy); } /* diff --git a/external/rocksdb/java/rocksjni/iterator.cc b/external/rocksdb/java/rocksjni/iterator.cc index e9eb0bb37c..c5e64adfbe 100644 --- a/external/rocksdb/java/rocksjni/iterator.cc +++ b/external/rocksdb/java/rocksjni/iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/rocksjni/loggerjnicallback.cc b/external/rocksdb/java/rocksjni/loggerjnicallback.cc index 71e50b9a98..42e97015ee 100644 --- a/external/rocksdb/java/rocksjni/loggerjnicallback.cc +++ b/external/rocksdb/java/rocksjni/loggerjnicallback.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -22,6 +22,30 @@ LoggerJniCallback::LoggerJniCallback( // across multiple method calls, so we create a global ref m_jLogger = env->NewGlobalRef(jlogger); m_jLogMethodId = LoggerJni::getLogMethodId(env); + + jobject jdebug_level = InfoLogLevelJni::DEBUG_LEVEL(env); + assert(jdebug_level != nullptr); + m_jdebug_level = env->NewGlobalRef(jdebug_level); + + jobject jinfo_level = InfoLogLevelJni::INFO_LEVEL(env); + assert(jinfo_level != nullptr); + m_jinfo_level = env->NewGlobalRef(jinfo_level); + + jobject jwarn_level = InfoLogLevelJni::WARN_LEVEL(env); + assert(jwarn_level != nullptr); + m_jwarn_level = env->NewGlobalRef(jwarn_level); + + jobject jerror_level = InfoLogLevelJni::ERROR_LEVEL(env); + assert(jerror_level != nullptr); + m_jerror_level = env->NewGlobalRef(jerror_level); + + jobject jfatal_level = InfoLogLevelJni::FATAL_LEVEL(env); + assert(jfatal_level != nullptr); + m_jfatal_level = env->NewGlobalRef(jfatal_level); + + jobject jheader_level = InfoLogLevelJni::HEADER_LEVEL(env); + assert(jheader_level != nullptr); + m_jheader_level = env->NewGlobalRef(jheader_level); } /** @@ -43,25 +67,30 @@ void LoggerJniCallback::Logv(const char* format, va_list ap) { void LoggerJniCallback::Logv(const InfoLogLevel log_level, const char* format, va_list ap) { if (GetInfoLogLevel() <= log_level) { - JNIEnv* env = getJniEnv(); // determine InfoLogLevel java enum instance jobject jlog_level; switch (log_level) { case rocksdb::InfoLogLevel::DEBUG_LEVEL: - jlog_level = InfoLogLevelJni::DEBUG_LEVEL(env); + jlog_level = m_jdebug_level; break; case rocksdb::InfoLogLevel::INFO_LEVEL: - jlog_level = InfoLogLevelJni::INFO_LEVEL(env); + jlog_level = m_jinfo_level; + break; + case rocksdb::InfoLogLevel::WARN_LEVEL: + jlog_level = m_jwarn_level; break; case rocksdb::InfoLogLevel::ERROR_LEVEL: - jlog_level = InfoLogLevelJni::ERROR_LEVEL(env); + jlog_level = m_jerror_level; break; case rocksdb::InfoLogLevel::FATAL_LEVEL: - jlog_level = InfoLogLevelJni::FATAL_LEVEL(env); + jlog_level = 
m_jfatal_level; + break; + case rocksdb::InfoLogLevel::HEADER_LEVEL: + jlog_level = m_jheader_level; break; default: - jlog_level = InfoLogLevelJni::FATAL_LEVEL(env); + jlog_level = m_jfatal_level; break; } @@ -98,6 +127,8 @@ void LoggerJniCallback::Logv(const InfoLogLevel log_level, assert(p < limit); *p++ = '\0'; + JNIEnv* env = getJniEnv(); + // pass java string to callback handler env->CallVoidMethod( m_jLogger, @@ -117,6 +148,14 @@ void LoggerJniCallback::Logv(const InfoLogLevel log_level, LoggerJniCallback::~LoggerJniCallback() { JNIEnv* env = getJniEnv(); env->DeleteGlobalRef(m_jLogger); + + env->DeleteGlobalRef(m_jdebug_level); + env->DeleteGlobalRef(m_jinfo_level); + env->DeleteGlobalRef(m_jwarn_level); + env->DeleteGlobalRef(m_jerror_level); + env->DeleteGlobalRef(m_jfatal_level); + env->DeleteGlobalRef(m_jheader_level); + m_jvm->DetachCurrentThread(); } @@ -125,9 +164,9 @@ LoggerJniCallback::~LoggerJniCallback() { /* * Class: org_rocksdb_Logger * Method: createNewLoggerOptions - * Signature: (J)V + * Signature: (J)J */ -void Java_org_rocksdb_Logger_createNewLoggerOptions( +jlong Java_org_rocksdb_Logger_createNewLoggerOptions( JNIEnv* env, jobject jobj, jlong joptions) { rocksdb::LoggerJniCallback* c = new rocksdb::LoggerJniCallback(env, jobj); @@ -137,15 +176,15 @@ void Java_org_rocksdb_Logger_createNewLoggerOptions( std::shared_ptr *pLoggerJniCallback = new std::shared_ptr; *pLoggerJniCallback = std::shared_ptr(c); - rocksdb::LoggerJni::setHandle(env, jobj, pLoggerJniCallback); + return reinterpret_cast(pLoggerJniCallback); } /* * Class: org_rocksdb_Logger * Method: createNewLoggerDbOptions - * Signature: (J)V + * Signature: (J)J */ -void Java_org_rocksdb_Logger_createNewLoggerDbOptions( +jlong Java_org_rocksdb_Logger_createNewLoggerDbOptions( JNIEnv* env, jobject jobj, jlong jdb_options) { rocksdb::LoggerJniCallback* c = new rocksdb::LoggerJniCallback(env, jobj); @@ -155,7 +194,7 @@ void Java_org_rocksdb_Logger_createNewLoggerDbOptions( std::shared_ptr 
*pLoggerJniCallback = new std::shared_ptr; *pLoggerJniCallback = std::shared_ptr(c); - rocksdb::LoggerJni::setHandle(env, jobj, pLoggerJniCallback); + return reinterpret_cast(pLoggerJniCallback); } /* diff --git a/external/rocksdb/java/rocksjni/loggerjnicallback.h b/external/rocksdb/java/rocksjni/loggerjnicallback.h index 3936252bca..9a7605d24f 100644 --- a/external/rocksdb/java/rocksjni/loggerjnicallback.h +++ b/external/rocksdb/java/rocksjni/loggerjnicallback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -38,6 +38,12 @@ namespace rocksdb { JavaVM* m_jvm; jobject m_jLogger; jmethodID m_jLogMethodId; + jobject m_jdebug_level; + jobject m_jinfo_level; + jobject m_jwarn_level; + jobject m_jerror_level; + jobject m_jfatal_level; + jobject m_jheader_level; }; } // namespace rocksdb diff --git a/external/rocksdb/java/rocksjni/memtablejni.cc b/external/rocksdb/java/rocksjni/memtablejni.cc index ce27f97699..ead038d50b 100644 --- a/external/rocksdb/java/rocksjni/memtablejni.cc +++ b/external/rocksdb/java/rocksjni/memtablejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/rocksjni/options.cc b/external/rocksdb/java/rocksjni/options.cc index 216fa5e8af..755c56d22c 100644 --- a/external/rocksdb/java/rocksjni/options.cc +++ b/external/rocksdb/java/rocksjni/options.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,7 +8,6 @@ #include #include #include -#include #include #include "include/org_rocksdb_Options.h" @@ -37,25 +36,25 @@ /* * Class: org_rocksdb_Options * Method: newOptions - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_Options_newOptions__(JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_Options_newOptions__(JNIEnv* env, jclass jcls) { rocksdb::Options* op = new rocksdb::Options(); - rocksdb::OptionsJni::setHandle(env, jobj, op); + return reinterpret_cast(op); } /* * Class: org_rocksdb_Options * Method: newOptions - * Signature: (JJ)V + * Signature: (JJ)J */ -void Java_org_rocksdb_Options_newOptions__JJ(JNIEnv* env, jobject jobj, +jlong Java_org_rocksdb_Options_newOptions__JJ(JNIEnv* env, jclass jcls, jlong jdboptions, jlong jcfoptions) { - auto dbOpt = reinterpret_cast(jdboptions); - auto cfOpt = reinterpret_cast( + auto* dbOpt = reinterpret_cast(jdboptions); + auto* cfOpt = reinterpret_cast( jcfoptions); rocksdb::Options* op = new rocksdb::Options(*dbOpt, *cfOpt); - rocksdb::OptionsJni::setHandle(env, jobj, op); + return reinterpret_cast(op); } /* @@ -576,6 +575,33 @@ void Java_org_rocksdb_Options_setKeepLogFileNum( } } +/* + * Class: org_rocksdb_Options + * Method: recycleLogFiles + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv* env, jobject jobj, + jlong jhandle) { + return 
reinterpret_cast(jhandle)->recycle_log_file_num; +} + +/* + * Class: org_rocksdb_Options + * Method: setRecycleLogFiles + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setRecycleLogFiles(JNIEnv* env, jobject jobj, + jlong jhandle, + jlong recycle_log_file_num) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->recycle_log_file_num = + recycle_log_file_num; + } else { + rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); + } +} + /* * Class: org_rocksdb_Options * Method: maxManifestFileSize @@ -1055,21 +1081,20 @@ jbyte Java_org_rocksdb_Options_compressionType( * vector. */ std::vector rocksdb_compression_vector_helper( - JNIEnv* env, jobject jcompressionLevels) { + JNIEnv* env, jbyteArray jcompressionLevels) { std::vector compressionLevels; - // iterate over compressionLevels - jobject iteratorObj = env->CallObjectMethod( - jcompressionLevels, rocksdb::ListJni::getIteratorMethod(env)); - while (env->CallBooleanMethod( - iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - // get compression - jobject jcompression_obj = env->CallObjectMethod(iteratorObj, - rocksdb::ListJni::getNextMethod(env)); - jbyte jcompression = env->CallByteMethod(jcompression_obj, - rocksdb::ByteJni::getByteValueMethod(env)); - compressionLevels.push_back(static_cast( - jcompression)); + + jsize len = env->GetArrayLength(jcompressionLevels); + jbyte* jcompressionLevel = env->GetByteArrayElements(jcompressionLevels, + NULL); + for(int i = 0; i < len; i++) { + jbyte jcl; + jcl = jcompressionLevel[i]; + compressionLevels.push_back(static_cast(jcl)); } + env->ReleaseByteArrayElements(jcompressionLevels, jcompressionLevel, + JNI_ABORT); + return compressionLevels; } @@ -1077,34 +1102,30 @@ std::vector rocksdb_compression_vector_helper( * Helper method to convert a CompressionType vector to a Java * List. 
*/ -jobject rocksdb_compression_list_helper(JNIEnv* env, +jbyteArray rocksdb_compression_list_helper(JNIEnv* env, std::vector compressionLevels) { - jclass jListClazz = env->FindClass("java/util/ArrayList"); - jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jListClazz); - jobject jcompressionLevels = env->NewObject(jListClazz, - midList, compressionLevels.size()); - // insert in java list + std::unique_ptr jbuf = + std::unique_ptr(new jbyte[compressionLevels.size()]); for (std::vector::size_type i = 0; i != compressionLevels.size(); i++) { - jclass jByteClazz = env->FindClass("java/lang/Byte"); - jmethodID midByte = env->GetMethodID(jByteClazz, "", "(B)V"); - jobject obj = env->NewObject(jByteClazz, midByte, - compressionLevels[i]); - env->CallBooleanMethod(jcompressionLevels, - rocksdb::ListJni::getListAddMethodId(env), obj); + jbuf[i] = compressionLevels[i]; } + // insert in java array + jbyteArray jcompressionLevels = env->NewByteArray( + static_cast(compressionLevels.size())); + env->SetByteArrayRegion(jcompressionLevels, 0, + static_cast(compressionLevels.size()), jbuf.get()); return jcompressionLevels; } /* * Class: org_rocksdb_Options * Method: setCompressionPerLevel - * Signature: (JLjava/util/List;)V + * Signature: (J[B)V */ void Java_org_rocksdb_Options_setCompressionPerLevel( JNIEnv* env, jobject jobj, jlong jhandle, - jobject jcompressionLevels) { + jbyteArray jcompressionLevels) { auto* options = reinterpret_cast(jhandle); std::vector compressionLevels = rocksdb_compression_vector_helper(env, jcompressionLevels); @@ -1114,9 +1135,9 @@ void Java_org_rocksdb_Options_setCompressionPerLevel( /* * Class: org_rocksdb_Options * Method: compressionPerLevel - * Signature: (J)Ljava/util/List; + * Signature: (J)[B */ -jobject Java_org_rocksdb_Options_compressionPerLevel( +jbyteArray Java_org_rocksdb_Options_compressionPerLevel( JNIEnv* env, jobject jobj, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return 
rocksdb_compression_list_helper(env, @@ -1153,7 +1174,7 @@ jbyte Java_org_rocksdb_Options_compactionStyle( void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) { reinterpret_cast(jhandle)->compaction_options_fifo.max_table_files_size = - static_cast(jmax_table_files_size); + static_cast(jmax_table_files_size); } /* @@ -1612,27 +1633,6 @@ void Java_org_rocksdb_Options_setVerifyChecksumsInCompaction( static_cast(jverify_checksums_in_compaction); } -/* - * Class: org_rocksdb_Options - * Method: filterDeletes - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_filterDeletes( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->filter_deletes; -} - -/* - * Class: org_rocksdb_Options - * Method: setFilterDeletes - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setFilterDeletes( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) { - reinterpret_cast(jhandle)->filter_deletes = - static_cast(jfilter_deletes); -} - /* * Class: org_rocksdb_Options * Method: maxSequentialSkipInIterations @@ -1712,50 +1712,27 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( /* * Class: org_rocksdb_Options - * Method: memtablePrefixBloomBits + * Method: memtablePrefixBloomSizeRatio * Signature: (J)I */ -jint Java_org_rocksdb_Options_memtablePrefixBloomBits( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->memtable_prefix_bloom_bits; -} - -/* - * Class: org_rocksdb_Options - * Method: setMemtablePrefixBloomBits - * Signature: (JI)V - */ -void Java_org_rocksdb_Options_setMemtablePrefixBloomBits( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jmemtable_prefix_bloom_bits) { - reinterpret_cast( - jhandle)->memtable_prefix_bloom_bits = - static_cast(jmemtable_prefix_bloom_bits); -} - -/* - * Class: org_rocksdb_Options - * Method: memtablePrefixBloomProbes - * Signature: (J)I - */ -jint 
Java_org_rocksdb_Options_memtablePrefixBloomProbes( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->memtable_prefix_bloom_probes; +jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv* env, + jobject jobj, + jlong jhandle) { + return reinterpret_cast(jhandle) + ->memtable_prefix_bloom_size_ratio; } /* * Class: org_rocksdb_Options - * Method: setMemtablePrefixBloomProbes + * Method: setMemtablePrefixBloomSizeRatio * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMemtablePrefixBloomProbes( +void Java_org_rocksdb_Options_setMemtablePrefixBloomSizeRatio( JNIEnv* env, jobject jobj, jlong jhandle, - jint jmemtable_prefix_bloom_probes) { - reinterpret_cast( - jhandle)->memtable_prefix_bloom_probes = - static_cast(jmemtable_prefix_bloom_probes); + jdouble jmemtable_prefix_bloom_size_ratio) { + reinterpret_cast(jhandle) + ->memtable_prefix_bloom_size_ratio = + static_cast(jmemtable_prefix_bloom_size_ratio); } /* @@ -1906,12 +1883,12 @@ void Java_org_rocksdb_Options_prepareForBulkLoad( /* * Class: org_rocksdb_ColumnFamilyOptions * Method: newColumnFamilyOptions - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( + JNIEnv* env, jclass jcls) { rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions(); - rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); + return reinterpret_cast(op); } /* @@ -2046,7 +2023,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator( * Method: setCompactionFilterHandle * Signature: (JJ)V */ -void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle__JJ( +void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle( JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcompactionfilter_handle) { reinterpret_cast(jopt_handle)-> @@ -2259,11 +2236,11 @@ jbyte 
Java_org_rocksdb_ColumnFamilyOptions_compressionType( /* * Class: org_rocksdb_ColumnFamilyOptions * Method: setCompressionPerLevel - * Signature: (JLjava/util/List;)V + * Signature: (J[B)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel( JNIEnv* env, jobject jobj, jlong jhandle, - jobject jcompressionLevels) { + jbyteArray jcompressionLevels) { auto* options = reinterpret_cast(jhandle); std::vector compressionLevels = rocksdb_compression_vector_helper(env, jcompressionLevels); @@ -2273,9 +2250,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel( /* * Class: org_rocksdb_ColumnFamilyOptions * Method: compressionPerLevel - * Signature: (J)Ljava/util/List; + * Signature: (J)[B */ -jobject Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel( +jbyteArray Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel( JNIEnv* env, jobject jobj, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return rocksdb_compression_list_helper(env, @@ -2312,7 +2289,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle( void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) { reinterpret_cast(jhandle)->compaction_options_fifo.max_table_files_size = - static_cast(jmax_table_files_size); + static_cast(jmax_table_files_size); } /* @@ -2794,28 +2771,6 @@ void Java_org_rocksdb_ColumnFamilyOptions_setVerifyChecksumsInCompaction( static_cast(jverify_checksums_in_compaction); } -/* - * Class: org_rocksdb_ColumnFamilyOptions - * Method: filterDeletes - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_ColumnFamilyOptions_filterDeletes( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - filter_deletes; -} - -/* - * Class: org_rocksdb_ColumnFamilyOptions - * Method: setFilterDeletes - * Signature: (JZ)V - */ -void Java_org_rocksdb_ColumnFamilyOptions_setFilterDeletes( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean 
jfilter_deletes) { - reinterpret_cast(jhandle)->filter_deletes = - static_cast(jfilter_deletes); -} - /* * Class: org_rocksdb_ColumnFamilyOptions * Method: maxSequentialSkipInIterations @@ -2895,50 +2850,26 @@ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks( /* * Class: org_rocksdb_ColumnFamilyOptions - * Method: memtablePrefixBloomBits - * Signature: (J)I - */ -jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomBits( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->memtable_prefix_bloom_bits; -} - -/* - * Class: org_rocksdb_ColumnFamilyOptions - * Method: setMemtablePrefixBloomBits - * Signature: (JI)V - */ -void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomBits( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jmemtable_prefix_bloom_bits) { - reinterpret_cast( - jhandle)->memtable_prefix_bloom_bits = - static_cast(jmemtable_prefix_bloom_bits); -} - -/* - * Class: org_rocksdb_ColumnFamilyOptions - * Method: memtablePrefixBloomProbes + * Method: memtablePrefixBloomSizeRatio * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomProbes( +jdouble Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomSizeRatio( JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->memtable_prefix_bloom_probes; + return reinterpret_cast(jhandle) + ->memtable_prefix_bloom_size_ratio; } /* * Class: org_rocksdb_ColumnFamilyOptions - * Method: setMemtablePrefixBloomProbes + * Method: setMemtablePrefixBloomSizeRatio * Signature: (JI)V */ -void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomProbes( +void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomSizeRatio( JNIEnv* env, jobject jobj, jlong jhandle, - jint jmemtable_prefix_bloom_probes) { - reinterpret_cast( - jhandle)->memtable_prefix_bloom_probes = - static_cast(jmemtable_prefix_bloom_probes); + jdouble jmemtable_prefix_bloom_size_ratio) { + reinterpret_cast(jhandle) + 
->memtable_prefix_bloom_size_ratio = + static_cast(jmemtable_prefix_bloom_size_ratio); } /* @@ -3046,12 +2977,12 @@ void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits( /* * Class: org_rocksdb_DBOptions * Method: newDBOptions - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, - jobject jobj) { +jlong Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, + jclass jcls) { rocksdb::DBOptions* dbop = new rocksdb::DBOptions(); - rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); + return reinterpret_cast(dbop); } /* @@ -3533,6 +3464,32 @@ jlong Java_org_rocksdb_DBOptions_keepLogFileNum( return reinterpret_cast(jhandle)->keep_log_file_num; } +/* + * Class: org_rocksdb_DBOptions + * Method: setRecycleLogFiles + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setRecycleLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle, jlong recycle_log_file_num) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->recycle_log_file_num = + recycle_log_file_num; + } else { + rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: recycleLogFiles + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv* env, jobject jobj, + jlong jhandle) { + return reinterpret_cast(jhandle)->recycle_log_file_num; +} + /* * Class: org_rocksdb_DBOptions * Method: setMaxManifestFileSize @@ -3820,12 +3777,12 @@ jlong Java_org_rocksdb_DBOptions_bytesPerSync( /* * Class: org_rocksdb_WriteOptions * Method: newWriteOptions - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_WriteOptions_newWriteOptions( - JNIEnv* env, jobject jwrite_options) { +jlong Java_org_rocksdb_WriteOptions_newWriteOptions( + JNIEnv* env, jclass jcls) { rocksdb::WriteOptions* op = new rocksdb::WriteOptions(); - rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op); + return 
reinterpret_cast(op); } /* @@ -3837,8 +3794,6 @@ void Java_org_rocksdb_WriteOptions_disposeInternal( JNIEnv* env, jobject jwrite_options, jlong jhandle) { auto write_options = reinterpret_cast(jhandle); delete write_options; - - rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr); } /* @@ -3887,12 +3842,12 @@ jboolean Java_org_rocksdb_WriteOptions_disableWAL( /* * Class: org_rocksdb_ReadOptions * Method: newReadOptions - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_ReadOptions_newReadOptions( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_ReadOptions_newReadOptions( + JNIEnv* env, jclass jcls) { auto read_opt = new rocksdb::ReadOptions(); - rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt); + return reinterpret_cast(read_opt); } /* @@ -3903,7 +3858,6 @@ void Java_org_rocksdb_ReadOptions_newReadOptions( void Java_org_rocksdb_ReadOptions_disposeInternal( JNIEnv* env, jobject jobj, jlong jhandle) { delete reinterpret_cast(jhandle); - rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr); } /* @@ -3971,6 +3925,90 @@ jboolean Java_org_rocksdb_ReadOptions_tailing( return reinterpret_cast(jhandle)->tailing; } +/* + * Class: org_rocksdb_ReadOptions + * Method: managed + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_managed( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->managed; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setManaged + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setManaged( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jmanaged) { + reinterpret_cast(jhandle)->managed = + static_cast(jmanaged); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: totalOrderSeek + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->total_order_seek; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setTotalOrderSeek + * Signature: 
(JZ)V + */ +void Java_org_rocksdb_ReadOptions_setTotalOrderSeek( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtotal_order_seek) { + reinterpret_cast(jhandle)->total_order_seek = + static_cast(jtotal_order_seek); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: prefixSameAsStart + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->prefix_same_as_start; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setPrefixSameAsStart + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setPrefixSameAsStart( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jprefix_same_as_start) { + reinterpret_cast(jhandle)->prefix_same_as_start = + static_cast(jprefix_same_as_start); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: pinData + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_pinData( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->pin_data; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setPinData + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setPinData( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jpin_data) { + reinterpret_cast(jhandle)->pin_data = + static_cast(jpin_data); +} + /* * Class: org_rocksdb_ReadOptions * Method: setSnapshot @@ -3994,18 +4032,40 @@ jlong Java_org_rocksdb_ReadOptions_snapshot( return reinterpret_cast(snapshot); } +/* + * Class: org_rocksdb_ReadOptions + * Method: readTier + * Signature: (J)B + */ +jbyte Java_org_rocksdb_ReadOptions_readTier( + JNIEnv* env, jobject jobj, jlong jhandle) { + return static_cast( + reinterpret_cast(jhandle)->read_tier); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setReadTier + * Signature: (JB)V + */ +void Java_org_rocksdb_ReadOptions_setReadTier( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte jread_tier) { + reinterpret_cast(jhandle)->read_tier = + static_cast(jread_tier); +} + 
///////////////////////////////////////////////////////////////////// // rocksdb::ComparatorOptions /* * Class: org_rocksdb_ComparatorOptions * Method: newComparatorOptions - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions( + JNIEnv* env, jclass jcls) { auto comparator_opt = new rocksdb::ComparatorJniCallbackOptions(); - rocksdb::ComparatorOptionsJni::setHandle(env, jobj, comparator_opt); + return reinterpret_cast(comparator_opt); } /* @@ -4038,7 +4098,6 @@ void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex( void Java_org_rocksdb_ComparatorOptions_disposeInternal( JNIEnv * env, jobject jobj, jlong jhandle) { delete reinterpret_cast(jhandle); - rocksdb::ComparatorOptionsJni::setHandle(env, jobj, nullptr); } ///////////////////////////////////////////////////////////////////// @@ -4047,12 +4106,12 @@ void Java_org_rocksdb_ComparatorOptions_disposeInternal( /* * Class: org_rocksdb_FlushOptions * Method: newFlushOptions - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_FlushOptions_newFlushOptions( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_FlushOptions_newFlushOptions( + JNIEnv* env, jclass jcls) { auto flush_opt = new rocksdb::FlushOptions(); - rocksdb::FlushOptionsJni::setHandle(env, jobj, flush_opt); + return reinterpret_cast(flush_opt); } /* @@ -4085,5 +4144,4 @@ jboolean Java_org_rocksdb_FlushOptions_waitForFlush( void Java_org_rocksdb_FlushOptions_disposeInternal( JNIEnv * env, jobject jobj, jlong jhandle) { delete reinterpret_cast(jhandle); - rocksdb::FlushOptionsJni::setHandle(env, jobj, nullptr); } diff --git a/external/rocksdb/java/rocksjni/portal.h b/external/rocksdb/java/rocksjni/portal.h index 804bbc68a1..4d7e502174 100644 --- a/external/rocksdb/java/rocksjni/portal.h +++ b/external/rocksdb/java/rocksjni/portal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -24,6 +24,11 @@ #include "rocksjni/loggerjnicallback.h" #include "rocksjni/writebatchhandlerjnicallback.h" +// Remove macro on windows +#ifdef DELETE +#undef DELETE +#endif + namespace rocksdb { // Detect if jlong overflows size_t @@ -44,27 +49,25 @@ template class RocksDBNativeClass { assert(jclazz != nullptr); return jclazz; } +}; - // Get the field id of the member variable to store - // the ptr - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - DERIVED::getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } +// Native class template for sub-classes of RocksMutableObject +template class NativeRocksMutableObject + : public RocksDBNativeClass { + public: - // Get the pointer from Java - static PTR getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); + static jmethodID getSetNativeHandleMethod(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + DERIVED::getJClass(env), "setNativeHandle", "(JZ)V"); + assert(mid != nullptr); + return mid; } // Pass the pointer to the java side. 
- static void setHandle(JNIEnv* env, jobject jdb, PTR ptr) { - env->SetLongField( - jdb, getHandleFieldID(env), - reinterpret_cast(ptr)); + static void setHandle(JNIEnv* env, jobject jobj, PTR ptr, + jboolean java_owns_handle) { + env->CallVoidMethod(jobj, getSetNativeHandleMethod(env), + reinterpret_cast(ptr), java_owns_handle); } }; @@ -402,7 +405,7 @@ class AbstractComparatorJni : public RocksDBNativeClass< }; // The portal class for org.rocksdb.AbstractSlice -class AbstractSliceJni : public RocksDBNativeClass< +class AbstractSliceJni : public NativeRocksMutableObject< const rocksdb::Slice*, AbstractSliceJni> { public: static jclass getJClass(JNIEnv* env) { @@ -644,67 +647,6 @@ class WriteEntryJni { assert(jclazz != nullptr); return jclazz; } - - static void setWriteType(JNIEnv* env, jobject jwrite_entry, - WriteType write_type) { - jobject jwrite_type; - switch (write_type) { - case kPutRecord: - jwrite_type = WriteTypeJni::PUT(env); - break; - - case kMergeRecord: - jwrite_type = WriteTypeJni::MERGE(env); - break; - - case kDeleteRecord: - jwrite_type = WriteTypeJni::DELETE(env); - break; - - case kLogDataRecord: - jwrite_type = WriteTypeJni::LOG(env); - break; - - default: - jwrite_type = nullptr; - } - assert(jwrite_type != nullptr); - env->SetObjectField(jwrite_entry, getWriteTypeField(env), jwrite_type); - } - - static void setKey(JNIEnv* env, jobject jwrite_entry, - const rocksdb::Slice* slice) { - jobject jkey = env->GetObjectField(jwrite_entry, getKeyField(env)); - AbstractSliceJni::setHandle(env, jkey, slice); - } - - static void setValue(JNIEnv* env, jobject jwrite_entry, - const rocksdb::Slice* slice) { - jobject jvalue = env->GetObjectField(jwrite_entry, getValueField(env)); - AbstractSliceJni::setHandle(env, jvalue, slice); - } - - private: - static jfieldID getWriteTypeField(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "type", "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); - assert(fid != nullptr); - return fid; - } - - 
static jfieldID getKeyField(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "key", "Lorg/rocksdb/DirectSlice;"); - assert(fid != nullptr); - return fid; - } - - static jfieldID getValueField(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "value", "Lorg/rocksdb/DirectSlice;"); - assert(fid != nullptr); - return fid; - } }; class InfoLogLevelJni { @@ -734,15 +676,20 @@ class InfoLogLevelJni { return getEnum(env, "FATAL_LEVEL"); } + // Get the HEADER_LEVEL enum field of org.rocksdb.InfoLogLevel + static jobject HEADER_LEVEL(JNIEnv* env) { + return getEnum(env, "HEADER_LEVEL"); + } + private: - // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteType. + // Get the java class id of org.rocksdb.InfoLogLevel static jclass getJClass(JNIEnv* env) { jclass jclazz = env->FindClass("org/rocksdb/InfoLogLevel"); assert(jclazz != nullptr); return jclazz; } - // Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType + // Get an enum field of org.rocksdb.InfoLogLevel static jobject getEnum(JNIEnv* env, const char name[]) { jclass jclazz = getJClass(env); jfieldID jfid = diff --git a/external/rocksdb/java/rocksjni/ratelimiterjni.cc b/external/rocksdb/java/rocksjni/ratelimiterjni.cc index ab6160e0d3..7b4bc1f224 100644 --- a/external/rocksdb/java/rocksjni/ratelimiterjni.cc +++ b/external/rocksdb/java/rocksjni/ratelimiterjni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc b/external/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc index e442d8dafa..ef17efeec6 100644 --- a/external/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc +++ b/external/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,16 +12,13 @@ /* * Class: org_rocksdb_RemoveEmptyValueCompactionFilter * Method: createNewRemoveEmptyValueCompactionFilter0 - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_RemoveEmptyValueCompactionFilter_createNewRemoveEmptyValueCompactionFilter0( - JNIEnv* env, jobject jobj) { - const rocksdb::RemoveEmptyValueCompactionFilter* compaction_filter = +jlong Java_org_rocksdb_RemoveEmptyValueCompactionFilter_createNewRemoveEmptyValueCompactionFilter0( + JNIEnv* env, jclass jcls) { + auto* compaction_filter = new rocksdb::RemoveEmptyValueCompactionFilter(); // set the native handle to our native compaction filter - static jclass jclazz = - env->FindClass("org/rocksdb/RemoveEmptyValueCompactionFilter"); - static jfieldID fid = env->GetFieldID(jclazz, "nativeHandle_", "J"); - env->SetLongField(jobj, fid, reinterpret_cast(compaction_filter)); + return reinterpret_cast(compaction_filter); } diff --git a/external/rocksdb/java/rocksjni/restorejni.cc b/external/rocksdb/java/rocksjni/restorejni.cc index a2341632ba..154a9b5f12 100644 --- a/external/rocksdb/java/rocksjni/restorejni.cc +++ b/external/rocksdb/java/rocksjni/restorejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,7 +13,6 @@ #include #include "include/org_rocksdb_RestoreOptions.h" -#include "include/org_rocksdb_RestoreBackupableDB.h" #include "rocksjni/portal.h" #include "rocksdb/utilities/backupable_db.h" /* @@ -22,182 +21,19 @@ * Signature: (Z)J */ jlong Java_org_rocksdb_RestoreOptions_newRestoreOptions(JNIEnv* env, - jobject jobj, jboolean keep_log_files) { + jclass jcls, jboolean keep_log_files) { auto ropt = new rocksdb::RestoreOptions(keep_log_files); return reinterpret_cast(ropt); } /* * Class: org_rocksdb_RestoreOptions - * Method: dispose + * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RestoreOptions_dispose(JNIEnv* env, jobject jobj, +void Java_org_rocksdb_RestoreOptions_disposeInternal(JNIEnv* env, jobject jobj, jlong jhandle) { auto ropt = reinterpret_cast(jhandle); assert(ropt); delete ropt; } - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: newRestoreBackupableDB - * Signature: (J)J - */ -jlong Java_org_rocksdb_RestoreBackupableDB_newRestoreBackupableDB(JNIEnv* env, - jobject jobj, jlong jopt_handle) { - auto opt = reinterpret_cast(jopt_handle); - auto rdb = new rocksdb::RestoreBackupableDB(rocksdb::Env::Default(), *opt); - return reinterpret_cast(rdb); -} - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: restoreDBFromBackup0 - * Signature: (JJLjava/lang/String;Ljava/lang/String;J)V - */ -void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromBackup0(JNIEnv* env, - jobject jobj, jlong jhandle, jlong jbackup_id, jstring jdb_dir, - jstring jwal_dir, jlong jopt_handle) { - auto opt = reinterpret_cast(jopt_handle); - - const char* cdb_dir = env->GetStringUTFChars(jdb_dir, 0); - const char* cwal_dir = 
env->GetStringUTFChars(jwal_dir, 0); - - auto rdb = reinterpret_cast(jhandle); - rocksdb::Status s = rdb->RestoreDBFromBackup( - static_cast(jbackup_id), cdb_dir, cwal_dir, *opt); - - env->ReleaseStringUTFChars(jdb_dir, cdb_dir); - env->ReleaseStringUTFChars(jwal_dir, cwal_dir); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: restoreDBFromLatestBackup0 - * Signature: (JLjava/lang/String;Ljava/lang/String;J)V - */ -void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromLatestBackup0( - JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_dir, jstring jwal_dir, - jlong jopt_handle) { - auto opt = reinterpret_cast(jopt_handle); - - const char* cdb_dir = env->GetStringUTFChars(jdb_dir, 0); - const char* cwal_dir = env->GetStringUTFChars(jwal_dir, 0); - - auto rdb = reinterpret_cast(jhandle); - rocksdb::Status s = - rdb->RestoreDBFromLatestBackup(cdb_dir, cwal_dir, *opt); - - env->ReleaseStringUTFChars(jdb_dir, cdb_dir); - env->ReleaseStringUTFChars(jwal_dir, cwal_dir); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: purgeOldBackups0 - * Signature: (JI)V - */ -void Java_org_rocksdb_RestoreBackupableDB_purgeOldBackups0(JNIEnv* env, - jobject jobj, jlong jhandle, jint jnum_backups_to_keep) { - auto rdb = reinterpret_cast(jhandle); - rocksdb::Status s = rdb->PurgeOldBackups(jnum_backups_to_keep); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: deleteBackup0 - * Signature: (JI)V - */ -void Java_org_rocksdb_RestoreBackupableDB_deleteBackup0(JNIEnv* env, - jobject jobj, jlong jhandle, jint jbackup_id) { - auto rdb = reinterpret_cast(jhandle); - rocksdb::Status s = rdb->DeleteBackup(jbackup_id); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: 
org_rocksdb_RestoreBackupableDB - * Method: getBackupInfo - * Signature: (J)Ljava/util/List; - */ -jobject Java_org_rocksdb_RestoreBackupableDB_getBackupInfo( - JNIEnv* env, jobject jbdb, jlong jhandle) { - std::vector backup_infos; - reinterpret_cast(jhandle)-> - GetBackupInfo(&backup_infos); - return rocksdb::BackupInfoListJni::getBackupInfo(env, - backup_infos); -} - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: getCorruptedBackups - * Signature: (J)[I; - */ -jintArray Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( - JNIEnv* env, jobject jbdb, jlong jhandle) { - std::vector backup_ids; - reinterpret_cast(jhandle)-> - GetCorruptedBackups(&backup_ids); - // store backupids in int array - const std::vector::size_type - kIdSize = backup_ids.size(); - - int int_backup_ids[kIdSize]; - for (std::vector::size_type i = 0; - i != kIdSize; i++) { - int_backup_ids[i] = backup_ids[i]; - } - // Store ints in java array - jintArray ret_backup_ids; - // Its ok to loose precision here (64->32) - jsize ret_backup_ids_size = static_cast(kIdSize); - ret_backup_ids = env->NewIntArray(ret_backup_ids_size); - env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, - int_backup_ids); - return ret_backup_ids; -} - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: garbageCollect - * Signature: (J)V - */ -void Java_org_rocksdb_RestoreBackupableDB_garbageCollect( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto db = reinterpret_cast( - jhandle); - rocksdb::Status s = db->GarbageCollect(); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -/* - * Class: org_rocksdb_RestoreBackupableDB - * Method: dispose - * Signature: (J)V - */ -void Java_org_rocksdb_RestoreBackupableDB_dispose(JNIEnv* env, jobject jobj, - jlong jhandle) { - auto ropt = reinterpret_cast(jhandle); - assert(ropt); - delete ropt; -} diff --git a/external/rocksdb/java/rocksjni/rocksjni.cc b/external/rocksdb/java/rocksjni/rocksjni.cc index 
221e7fff2d..4c68178636 100644 --- a/external/rocksdb/java/rocksjni/rocksjni.cc +++ b/external/rocksdb/java/rocksjni/rocksjni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include "include/org_rocksdb_RocksDB.h" #include "rocksdb/db.h" @@ -19,219 +21,149 @@ #include "rocksdb/types.h" #include "rocksjni/portal.h" +#ifdef min +#undef min +#endif + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Open - -/* - * Class: org_rocksdb_RocksDB - * Method: open - * Signature: (JLjava/lang/String;)V - */ -void Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2( - JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { - auto opt = reinterpret_cast(jopt_handle); +jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path, + std::function open_fn + ) { + auto* opt = reinterpret_cast(jopt_handle); rocksdb::DB* db = nullptr; - const char* db_path = env->GetStringUTFChars(jdb_path, 0); - rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db); + const char* db_path = env->GetStringUTFChars(jdb_path, NULL); + rocksdb::Status s = open_fn(*opt, db_path, &db); env->ReleaseStringUTFChars(jdb_path, db_path); if (s.ok()) { - rocksdb::RocksDBJni::setHandle(env, jdb, db); - return; + return reinterpret_cast(db); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } /* * Class: org_rocksdb_RocksDB - * Method: openROnly - * Signature: (JLjava/lang/String;)V + * Method: open + * Signature: (JLjava/lang/String;)J */ -void 
Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( - JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { - auto opt = reinterpret_cast(jopt_handle); - rocksdb::DB* db = nullptr; - const char* db_path = env->GetStringUTFChars(jdb_path, 0); - rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt, - db_path, &db); - env->ReleaseStringUTFChars(jdb_path, db_path); - - if (s.ok()) { - rocksdb::RocksDBJni::setHandle(env, jdb, db); - return; - } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2( + JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path) { + return rocksdb_open_helper(env, jopt_handle, jdb_path, + (rocksdb::Status(*) + (const rocksdb::Options&, const std::string&, rocksdb::DB**) + )&rocksdb::DB::Open + ); } /* * Class: org_rocksdb_RocksDB * Method: openROnly - * Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List; + * Signature: (JLjava/lang/String;)J */ -jobject - Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Ljava_util_List_2I( - JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, - jobject jcfdesc_list, jint jcfdesc_count) { - auto opt = reinterpret_cast(jopt_handle); - rocksdb::DB* db = nullptr; - const char* db_path = env->GetStringUTFChars(jdb_path, 0); - - std::vector cfnames_to_free; - std::vector jcfnames_for_free; +jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( + JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path) { + return rocksdb_open_helper(env, jopt_handle, jdb_path, []( + const rocksdb::Options& options, + const std::string& db_path, rocksdb::DB** db) { + return rocksdb::DB::OpenForReadOnly(options, db_path, db); + }); +} + +jlongArray rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, + jstring jdb_path, jobjectArray jcolumn_names, jlongArray jcolumn_options, + std::function&, + std::vector*, + rocksdb::DB**)> open_fn + ) { + auto* opt = reinterpret_cast(jopt_handle); + const char* db_path = 
env->GetStringUTFChars(jdb_path, NULL); std::vector column_families; - std::vector handles; - // get iterator for ColumnFamilyDescriptors - jobject iteratorObj = env->CallObjectMethod( - jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); - - // iterate over ColumnFamilyDescriptors - while (env->CallBooleanMethod( - iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - // get ColumnFamilyDescriptor - jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, - rocksdb::ListJni::getNextMethod(env)); - // get ColumnFamilyName - jbyteArray cf_name_in_byte_array = static_cast( - env->CallObjectMethod(jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); - // get CF Options - jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); - rocksdb::ColumnFamilyOptions* cfOptions = - rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - - jbyte* cfname = env->GetByteArrayElements(cf_name_in_byte_array, 0); - const int len = env->GetArrayLength(cf_name_in_byte_array); - - // free allocated cfnames after call to open - cfnames_to_free.push_back(cfname); - jcfnames_for_free.push_back(cf_name_in_byte_array); - column_families.push_back(rocksdb::ColumnFamilyDescriptor( - std::string(reinterpret_cast(cfname), len), *cfOptions)); - } - rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt, - db_path, column_families, &handles, &db); - env->ReleaseStringUTFChars(jdb_path, db_path); - // free jbyte allocations - for (std::vector::size_type i = 0; - i != cfnames_to_free.size(); i++) { - // free cfnames - env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + jsize len_cols = env->GetArrayLength(jcolumn_names); + jlong* jco = env->GetLongArrayElements(jcolumn_options, NULL); + for(int i = 0; i < len_cols; i++) { + jobject jcn = env->GetObjectArrayElement(jcolumn_names, i); + jbyteArray jcn_ba = reinterpret_cast(jcn); 
+ jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, NULL); + const int jcf_name_len = env->GetArrayLength(jcn_ba); + + //TODO(AR) do I need to make a copy of jco[i] ? + + std::string cf_name (reinterpret_cast(jcf_name), jcf_name_len); + rocksdb::ColumnFamilyOptions* cf_options = + reinterpret_cast(jco[i]); + column_families.push_back( + rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options)); + + env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT); + env->DeleteLocalRef(jcn); } + env->ReleaseLongArrayElements(jcolumn_options, jco, JNI_ABORT); + + std::vector handles; + rocksdb::DB* db = nullptr; + rocksdb::Status s = open_fn(*opt, db_path, column_families, + &handles, &db); // check if open operation was successful if (s.ok()) { - rocksdb::RocksDBJni::setHandle(env, jdb, db); - jclass jListClazz = env->FindClass("java/util/ArrayList"); - jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jListClazz); - jobject jcfhandle_list = env->NewObject(jListClazz, - midList, handles.size()); - // insert in java list - for (std::vector::size_type i = 0; - i != handles.size(); i++) { - // jlong must be converted to Long due to collections restrictions - jclass jLongClazz = env->FindClass("java/lang/Long"); - jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); - jobject obj = env->NewObject(jLongClazz, midLong, - reinterpret_cast(handles[i])); - env->CallBooleanMethod(jcfhandle_list, - rocksdb::ListJni::getListAddMethodId(env), obj); + jsize resultsLen = 1 + len_cols; //db handle + column family handles + std::unique_ptr results = + std::unique_ptr(new jlong[resultsLen]); + results[0] = reinterpret_cast(db); + for(int i = 1; i <= len_cols; i++) { + results[i] = reinterpret_cast(handles[i - 1]); } - return jcfhandle_list; + jlongArray jresults = env->NewLongArray(resultsLen); + env->SetLongArrayRegion(jresults, 0, resultsLen, results.get()); + return jresults; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return NULL; 
} - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return nullptr; } /* * Class: org_rocksdb_RocksDB - * Method: open - * Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List; + * Method: openROnly + * Signature: (JLjava/lang/String;[[B[J)[J */ -jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( - JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, - jobject jcfdesc_list, jint jcfdesc_count) { - auto opt = reinterpret_cast(jopt_handle); - rocksdb::DB* db = nullptr; - const char* db_path = env->GetStringUTFChars(jdb_path, 0); - - std::vector cfnames_to_free; - std::vector jcfnames_for_free; - - std::vector column_families; - std::vector handles; - // get iterator for ColumnFamilyDescriptors - jobject iteratorObj = env->CallObjectMethod( - jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); - - // iterate over ColumnFamilyDescriptors - while (env->CallBooleanMethod( - iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - // get ColumnFamilyDescriptor - jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, - rocksdb::ListJni::getNextMethod(env)); - // get ColumnFamilyName - jbyteArray cf_name_in_byte_array = static_cast( - env->CallObjectMethod(jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); - // get CF Options - jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); - rocksdb::ColumnFamilyOptions* cfOptions = - rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - - jbyte* cfname = env->GetByteArrayElements(cf_name_in_byte_array, 0); - const int len = env->GetArrayLength(cf_name_in_byte_array); - - // free allocated cfnames after call to open - cfnames_to_free.push_back(cfname); - jcfnames_for_free.push_back(cf_name_in_byte_array); - column_families.push_back(rocksdb::ColumnFamilyDescriptor( - std::string(reinterpret_cast(cfname), len), *cfOptions)); - } 
- - rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, column_families, - &handles, &db); - env->ReleaseStringUTFChars(jdb_path, db_path); - // free jbyte allocations - for (std::vector::size_type i = 0; - i != cfnames_to_free.size(); i++) { - // free cfnames - env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); - } - - // check if open operation was successful - if (s.ok()) { - rocksdb::RocksDBJni::setHandle(env, jdb, db); - jclass jListClazz = env->FindClass("java/util/ArrayList"); - jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jListClazz); - jobject jcfhandle_list = env->NewObject(jListClazz, - midList, handles.size()); - // insert in java list - for (std::vector::size_type i = 0; - i != handles.size(); i++) { - // jlong must be converted to Long due to collections restrictions - jclass jLongClazz = env->FindClass("java/lang/Long"); - jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); - jobject obj = env->NewObject(jLongClazz, midLong, - reinterpret_cast(handles[i])); - env->CallBooleanMethod(jcfhandle_list, - rocksdb::ListJni::getListAddMethodId(env), obj); - } +jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3J( + JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path, + jobjectArray jcolumn_names, jlongArray jcolumn_options) { + return rocksdb_open_helper(env, jopt_handle, jdb_path, jcolumn_names, + jcolumn_options, []( + const rocksdb::DBOptions& options, const std::string& db_path, + const std::vector& column_families, + std::vector* handles, rocksdb::DB** db) { + return rocksdb::DB::OpenForReadOnly(options, db_path, column_families, + handles, db); + }); +} - return jcfhandle_list; - } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return nullptr; +/* + * Class: org_rocksdb_RocksDB + * Method: open + * Signature: (JLjava/lang/String;[[B[J)[J + */ +jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J( + JNIEnv* env, jclass jcls, jlong 
jopt_handle, jstring jdb_path, + jobjectArray jcolumn_names, jlongArray jcolumn_options) { + return rocksdb_open_helper(env, jopt_handle, jdb_path, jcolumn_names, + jcolumn_options, (rocksdb::Status(*) + (const rocksdb::DBOptions&, const std::string&, + const std::vector&, + std::vector*, rocksdb::DB**) + )&rocksdb::DB::Open + ); } ////////////////////////////////////////////////////////////////////////////// @@ -240,25 +172,21 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( /* * Class: org_rocksdb_RocksDB * Method: listColumnFamilies - * Signature: (JLjava/lang/String;)Ljava/util/List; + * Signature: (JLjava/lang/String;)[[B */ -jobject Java_org_rocksdb_RocksDB_listColumnFamilies( +jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies( JNIEnv* env, jclass jclazz, jlong jopt_handle, jstring jdb_path) { std::vector column_family_names; - auto opt = reinterpret_cast(jopt_handle); + auto* opt = reinterpret_cast(jopt_handle); const char* db_path = env->GetStringUTFChars(jdb_path, 0); - jobject jvalue_list = nullptr; - rocksdb::Status s = rocksdb::DB::ListColumnFamilies(*opt, db_path, &column_family_names); env->ReleaseStringUTFChars(jdb_path, db_path); - if (s.ok()) { - // Don't reuse class pointer - jclass jListClazz = env->FindClass("java/util/ArrayList"); - jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(env, - jListClazz); - jvalue_list = env->NewObject(jListClazz, mid, column_family_names.size()); + jclass jcls_ba = env->FindClass("[B"); + jobjectArray jresults = env->NewObjectArray( + static_cast(column_family_names.size()), jcls_ba, NULL); + if (s.ok()) { for (std::vector::size_type i = 0; i < column_family_names.size(); i++) { jbyteArray jcf_value = @@ -266,11 +194,11 @@ jobject Java_org_rocksdb_RocksDB_listColumnFamilies( env->SetByteArrayRegion( jcf_value, 0, static_cast(column_family_names[i].size()), reinterpret_cast(column_family_names[i].data())); - env->CallBooleanMethod(jvalue_list, - 
rocksdb::ListJni::getListAddMethodId(env), jcf_value); + env->SetObjectArrayElement(jresults, static_cast(i), jcf_value); + env->DeleteLocalRef(jcf_value); } } - return jvalue_list; + return jresults; } ////////////////////////////////////////////////////////////////////////////// @@ -393,12 +321,12 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BIJ( /* * Class: org_rocksdb_RocksDB * Method: write0 - * Signature: (JJ)V + * Signature: (JJJ)V */ void Java_org_rocksdb_RocksDB_write0( - JNIEnv* env, jobject jdb, + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, jlong jwb_handle) { - rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast( jwrite_options_handle); auto* wb = reinterpret_cast(jwb_handle); @@ -413,12 +341,12 @@ void Java_org_rocksdb_RocksDB_write0( /* * Class: org_rocksdb_RocksDB * Method: write1 - * Signature: (JJ)V + * Signature: (JJJ)V */ void Java_org_rocksdb_RocksDB_write1( - JNIEnv* env, jobject jdb, + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, jlong jwbwi_handle) { - rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast( jwrite_options_handle); auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -465,12 +393,12 @@ jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, /* * Class: org_rocksdb_RocksDB * Method: keyMayExist - * Signature: ([BILjava/lang/StringBuffer;)Z + * Signature: (J[BILjava/lang/StringBuffer;)Z */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BILjava_lang_StringBuffer_2( - JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len, +jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, jobject jstring_buffer) { - rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto* db = 
reinterpret_cast(jdb_handle); return key_may_exist_helper(env, db, rocksdb::ReadOptions(), nullptr, jkey, jkey_len, jstring_buffer); } @@ -478,13 +406,13 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BILjava_lang_StringBuffer_2( /* * Class: org_rocksdb_RocksDB * Method: keyMayExist - * Signature: ([BIJLjava/lang/StringBuffer;)Z + * Signature: (J[BIJLjava/lang/StringBuffer;)Z */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2( - JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len, +jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle, jobject jstring_buffer) { - rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); - auto cf_handle = reinterpret_cast( + auto* db = reinterpret_cast(jdb_handle); + auto* cf_handle = reinterpret_cast( jcf_handle); if (cf_handle != nullptr) { return key_may_exist_helper(env, db, rocksdb::ReadOptions(), @@ -492,19 +420,19 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2( } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + return true; } - return true; } /* * Class: org_rocksdb_RocksDB * Method: keyMayExist - * Signature: (J[BILjava/lang/StringBuffer;)Z + * Signature: (JJ[BILjava/lang/StringBuffer;)Z */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2( - JNIEnv* env, jobject jdb, jlong jread_options_handle, +jboolean Java_org_rocksdb_RocksDB_keyMayExist__JJ_3BILjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jread_options_handle, jbyteArray jkey, jint jkey_len, jobject jstring_buffer) { - rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto* db = reinterpret_cast(jdb_handle); auto& read_options = *reinterpret_cast( jread_options_handle); return key_may_exist_helper(env, db, read_options, @@ -514,15 
+442,15 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2( /* * Class: org_rocksdb_RocksDB * Method: keyMayExist - * Signature: (J[BIJLjava/lang/StringBuffer;)Z + * Signature: (JJ[BIJLjava/lang/StringBuffer;)Z */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2( - JNIEnv* env, jobject jdb, jlong jread_options_handle, +jboolean Java_org_rocksdb_RocksDB_keyMayExist__JJ_3BIJLjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jread_options_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle, jobject jstring_buffer) { - rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto* db = reinterpret_cast(jdb_handle); auto& read_options = *reinterpret_cast( jread_options_handle); - auto cf_handle = reinterpret_cast( + auto* cf_handle = reinterpret_cast( jcf_handle); if (cf_handle != nullptr) { return key_may_exist_helper(env, db, read_options, cf_handle, @@ -530,8 +458,8 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2( } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + return true; } - return true; } ////////////////////////////////////////////////////////////////////////////// @@ -688,8 +616,8 @@ jint rocksdb_get_helper( return kStatusError; } - int cvalue_len = static_cast(cvalue.size()); - int length = std::min(jentry_value_len, cvalue_len); + jint cvalue_len = static_cast(cvalue.size()); + jint length = std::min(jentry_value_len, cvalue_len); env->SetByteArrayRegion( jentry_value, 0, length, @@ -698,49 +626,38 @@ jint rocksdb_get_helper( } // cf multi get -jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, - const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count, - jobject jcfhandle_list) { - std::vector keys; - std::vector keys_to_free; +jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, + const 
rocksdb::ReadOptions& rOpt, jobjectArray jkeys, + jlongArray jcolumn_family_handles) { std::vector cf_handles; - - if (jcfhandle_list != nullptr) { - // get cf iterator - jobject cfIteratorObj = env->CallObjectMethod( - jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env)); - - // iterate over keys and convert java byte array to slice - while (env->CallBooleanMethod( - cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - jobject jobj = (jbyteArray) env->CallObjectMethod( - cfIteratorObj, rocksdb::ListJni::getNextMethod(env)); - rocksdb::ColumnFamilyHandle* cfHandle = - rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj); - cf_handles.push_back(cfHandle); + if (jcolumn_family_handles != nullptr) { + jsize len_cols = env->GetArrayLength(jcolumn_family_handles); + jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, NULL); + for (int i = 0; i < len_cols; i++) { + auto* cf_handle = + reinterpret_cast(jcfh[i]); + cf_handles.push_back(cf_handle); } + env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); + } + + std::vector keys; + std::vector> keys_to_free; + jsize len_keys = env->GetArrayLength(jkeys); + if(env->EnsureLocalCapacity(len_keys) != 0) { + // out of memory + return NULL; } + for (int i = 0; i < len_keys; i++) { + jobject jk = env->GetObjectArrayElement(jkeys, i); + jbyteArray jk_ba = reinterpret_cast(jk); + jsize len_key = env->GetArrayLength(jk_ba); + jbyte* jk_val = env->GetByteArrayElements(jk_ba, NULL); - // Process key list - // get iterator - jobject iteratorObj = env->CallObjectMethod( - jkey_list, rocksdb::ListJni::getIteratorMethod(env)); - - // iterate over keys and convert java byte array to slice - while (env->CallBooleanMethod( - iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - jbyteArray jkey = (jbyteArray) env->CallObjectMethod( - iteratorObj, rocksdb::ListJni::getNextMethod(env)); - jint key_length = env->GetArrayLength(jkey); - - jbyte* key = new jbyte[key_length]; - 
env->GetByteArrayRegion(jkey, 0, key_length, key); - // store allocated jbyte to free it after multiGet call - keys_to_free.push_back(key); - - rocksdb::Slice key_slice( - reinterpret_cast(key), key_length); + rocksdb::Slice key_slice(reinterpret_cast(jk_val), len_key); keys.push_back(key_slice); + + keys_to_free.push_back(std::make_tuple(jk_ba, jk_val, jk)); } std::vector values; @@ -751,13 +668,23 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, s = db->MultiGet(rOpt, cf_handles, keys, &values); } - // Don't reuse class pointer - jclass jclazz = env->FindClass("java/util/ArrayList"); - jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jclazz); - jobject jvalue_list = env->NewObject(jclazz, mid, jkeys_count); + // free up allocated byte arrays + for (std::vector>::size_type i = 0; + i < keys_to_free.size(); i++) { + jobject jk; + jbyteArray jk_ba; + jbyte* jk_val; + std::tie(jk_ba, jk_val, jk) = keys_to_free[i]; + env->ReleaseByteArrayElements(jk_ba, jk_val, JNI_ABORT); + env->DeleteLocalRef(jk); + } - // insert in java list + // prepare the results + jclass jcls_ba = env->FindClass("[B"); + jobjectArray jresults = + env->NewObjectArray(static_cast(s.size()), jcls_ba, NULL); + + // add to the jresults for (std::vector::size_type i = 0; i != s.size(); i++) { if (s[i].ok()) { jbyteArray jentry_value = @@ -765,73 +692,60 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, env->SetByteArrayRegion( jentry_value, 0, static_cast(values[i].size()), reinterpret_cast(values[i].c_str())); - env->CallBooleanMethod( - jvalue_list, rocksdb::ListJni::getListAddMethodId(env), - jentry_value); - } else { - env->CallBooleanMethod( - jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr); + env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); + env->DeleteLocalRef(jentry_value); } } - // free up allocated byte arrays - for (std::vector::size_type i = 0; i != keys_to_free.size(); i++) { - 
delete[] keys_to_free[i]; - } - keys_to_free.clear(); - return jvalue_list; + + return jresults; } /* * Class: org_rocksdb_RocksDB * Method: multiGet - * Signature: (JLjava/util/List;I)Ljava/util/List; + * Signature: (J[[B)[[B */ -jobject Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2I( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jobject jkey_list, jint jkeys_count) { +jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B( + JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), jkey_list, jkeys_count, nullptr); + rocksdb::ReadOptions(), jkeys, nullptr); } /* * Class: org_rocksdb_RocksDB * Method: multiGet - * Signature: (JLjava/util/List;ILjava/util/List;)Ljava/util/List; + * Signature: (J[[B[J)[[B */ -jobject - Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2ILjava_util_List_2( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jobject jkey_list, jint jkeys_count, jobject jcfhandle_list) { +jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3J( + JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, + jlongArray jcolumn_family_handles) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), jkey_list, jkeys_count, jcfhandle_list); + rocksdb::ReadOptions(), jkeys, jcolumn_family_handles); } /* * Class: org_rocksdb_RocksDB * Method: multiGet - * Signature: (JJLjava/util/List;I)Ljava/util/List; + * Signature: (JJ[[B)[[B */ -jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jropt_handle, jobject jkey_list, jint jkeys_count) { +jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jobjectArray jkeys) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), jkey_list, - jkeys_count, nullptr); + *reinterpret_cast(jropt_handle), 
jkeys, nullptr); } /* * Class: org_rocksdb_RocksDB * Method: multiGet - * Signature: (JJLjava/util/List;ILjava/util/List;)Ljava/util/List; + * Signature: (JJ[[B[J)[[B */ -jobject - Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2ILjava_util_List_2( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jropt_handle, jobject jkey_list, jint jkeys_count, - jobject jcfhandle_list) { +jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B_3J( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jobjectArray jkeys, jlongArray jcolumn_family_handles) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), jkey_list, - jkeys_count, jcfhandle_list); + *reinterpret_cast(jropt_handle), jkeys, + jcolumn_family_handles); } /* @@ -1199,47 +1113,42 @@ jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ( /* * Class: org_rocksdb_RocksDB * Method: iterators - * Signature: (JLjava/util/List;J)[J + * Signature: (J[JJ)[J */ jlongArray Java_org_rocksdb_RocksDB_iterators( - JNIEnv* env, jobject jdb, jlong db_handle, jobject jcfhandle_list, - jlong jread_options_handle) { - auto db = reinterpret_cast(db_handle); + JNIEnv* env, jobject jdb, jlong db_handle, + jlongArray jcolumn_family_handles, jlong jread_options_handle) { + auto* db = reinterpret_cast(db_handle); auto& read_options = *reinterpret_cast( jread_options_handle); std::vector cf_handles; - std::vector iterators; - - if (jcfhandle_list != nullptr) { - // get cf iterator - jobject cfIteratorObj = env->CallObjectMethod( - jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env)); - - // iterate over keys and convert java byte array to slice - while (env->CallBooleanMethod( - cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - jobject jobj = (jbyteArray) env->CallObjectMethod( - cfIteratorObj, rocksdb::ListJni::getNextMethod(env)); - rocksdb::ColumnFamilyHandle* cfHandle = - rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj); - 
cf_handles.push_back(cfHandle); + if (jcolumn_family_handles != nullptr) { + jsize len_cols = env->GetArrayLength(jcolumn_family_handles); + jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, NULL); + for (int i = 0; i < len_cols; i++) { + auto* cf_handle = + reinterpret_cast(jcfh[i]); + cf_handles.push_back(cf_handle); } + env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); } + std::vector iterators; rocksdb::Status s = db->NewIterators(read_options, cf_handles, &iterators); if (s.ok()) { jlongArray jLongArray = env->NewLongArray(static_cast(iterators.size())); - for (std::vector::size_type i = 0; i < iterators.size(); - i++) { + for (std::vector::size_type i = 0; + i < iterators.size(); i++) { env->SetLongArrayRegion(jLongArray, static_cast(i), 1, reinterpret_cast(&iterators[i])); } return jLongArray; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return NULL; } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return env->NewLongArray(0); } /* @@ -1257,32 +1166,23 @@ jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily( /* * Class: org_rocksdb_RocksDB * Method: createColumnFamily - * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;)J; + * Signature: (J[BJ)J */ jlong Java_org_rocksdb_RocksDB_createColumnFamily( JNIEnv* env, jobject jdb, jlong jdb_handle, - jobject jcf_descriptor) { + jbyteArray jcolumn_name, jlong jcolumn_options) { rocksdb::ColumnFamilyHandle* handle; auto db_handle = reinterpret_cast(jdb_handle); - // get ColumnFamilyName - jbyteArray byteArray = static_cast(env->CallObjectMethod( - jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); - // get CF Options - jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); - rocksdb::ColumnFamilyOptions* cfOptions = - rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - - jbyte* cfname = env->GetByteArrayElements(byteArray, 
0); - const int len = env->GetArrayLength(byteArray); + jbyte* cfname = env->GetByteArrayElements(jcolumn_name, 0); + const int len = env->GetArrayLength(jcolumn_name); + + auto* cfOptions = + reinterpret_cast(jcolumn_options); rocksdb::Status s = db_handle->CreateColumnFamily( *cfOptions, std::string(reinterpret_cast(cfname), len), &handle); - env->ReleaseByteArrayElements(byteArray, cfname, 0); + env->ReleaseByteArrayElements(jcolumn_name, cfname, 0); if (s.ok()) { return reinterpret_cast(handle); @@ -1585,6 +1485,42 @@ void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ( jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); } +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::PauseBackgroundWork + +/* + * Class: org_rocksdb_RocksDB + * Method: pauseBackgroundWork + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_pauseBackgroundWork( + JNIEnv* env, jobject jobj, jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto s = db->PauseBackgroundWork(); + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::ContinueBackgroundWork + +/* + * Class: org_rocksdb_RocksDB + * Method: continueBackgroundWork + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_continueBackgroundWork( + JNIEnv* env, jobject jobj, jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto s = db->ContinueBackgroundWork(); + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::GetLatestSequenceNumber diff --git a/external/rocksdb/java/rocksjni/slice.cc b/external/rocksdb/java/rocksjni/slice.cc index 8111173971..e5eb383bd6 100644 --- a/external/rocksdb/java/rocksjni/slice.cc +++ b/external/rocksdb/java/rocksjni/slice.cc @@ -1,4 +1,4 @@ -// Copyright (c) 
2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -22,12 +22,11 @@ /* * Class: org_rocksdb_AbstractSlice * Method: createNewSliceFromString - * Signature: (Ljava/lang/String;)V + * Signature: (Ljava/lang/String;)J */ -void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( - JNIEnv* env, jobject jobj, jstring jstr) { - - const auto* str = env->GetStringUTFChars(jstr, 0); +jlong Java_org_rocksdb_AbstractSlice_createNewSliceFromString( + JNIEnv * env, jclass jcls, jstring jstr) { + const auto* str = env->GetStringUTFChars(jstr, NULL); const size_t len = strlen(str); char* buf = new char[len + 1]; memcpy(buf, str, len); @@ -35,7 +34,7 @@ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( env->ReleaseStringUTFChars(jstr, str); const auto* slice = new rocksdb::Slice(buf); - rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + return reinterpret_cast(slice); } /* @@ -115,10 +114,10 @@ void Java_org_rocksdb_AbstractSlice_disposeInternal( /* * Class: org_rocksdb_Slice * Method: createNewSlice0 - * Signature: ([BI)V + * Signature: ([BI)J */ -void Java_org_rocksdb_Slice_createNewSlice0( - JNIEnv * env, jobject jobj, jbyteArray data, jint offset) { +jlong Java_org_rocksdb_Slice_createNewSlice0( + JNIEnv * env, jclass jcls, jbyteArray data, jint offset) { const jsize dataSize = env->GetArrayLength(data); const int len = dataSize - offset; @@ -126,32 +125,33 @@ void Java_org_rocksdb_Slice_createNewSlice0( env->GetByteArrayRegion(data, offset, len, ptrData); const auto* slice = new rocksdb::Slice((const char*)ptrData, len); - rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + return reinterpret_cast(slice); } /* * Class: org_rocksdb_Slice * Method: createNewSlice1 
- * Signature: ([B)V + * Signature: ([B)J */ -void Java_org_rocksdb_Slice_createNewSlice1( - JNIEnv * env, jobject jobj, jbyteArray data) { +jlong Java_org_rocksdb_Slice_createNewSlice1( + JNIEnv * env, jclass jcls, jbyteArray data) { const int len = env->GetArrayLength(data) + 1; jboolean isCopy; jbyte* ptrData = env->GetByteArrayElements(data, &isCopy); - char* buf = new char[len]; + // NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method + char* buf = new char[len]; memcpy(buf, ptrData, len - 1); buf[len-1]='\0'; const auto* slice = new rocksdb::Slice(buf, len - 1); - rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT); - // NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method + + return reinterpret_cast(slice); } /* @@ -187,27 +187,27 @@ void Java_org_rocksdb_Slice_disposeInternalBuf( /* * Class: org_rocksdb_DirectSlice * Method: createNewDirectSlice0 - * Signature: (Ljava/nio/ByteBuffer;I)V + * Signature: (Ljava/nio/ByteBuffer;I)J */ -void Java_org_rocksdb_DirectSlice_createNewDirectSlice0( - JNIEnv* env, jobject jobj, jobject data, jint length) { +jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice0( + JNIEnv* env, jclass jcls, jobject data, jint length) { const auto* ptrData = reinterpret_cast(env->GetDirectBufferAddress(data)); const auto* slice = new rocksdb::Slice(ptrData, length); - rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + return reinterpret_cast(slice); } /* * Class: org_rocksdb_DirectSlice * Method: createNewDirectSlice1 - * Signature: (Ljava/nio/ByteBuffer;)V + * Signature: (Ljava/nio/ByteBuffer;)J */ -void Java_org_rocksdb_DirectSlice_createNewDirectSlice1( - JNIEnv* env, jobject jobj, jobject data) { +jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice1( + JNIEnv* env, jclass jcls, jobject data) { const auto* ptrData = reinterpret_cast(env->GetDirectBufferAddress(data)); const auto* slice = new rocksdb::Slice(ptrData); - 
rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + return reinterpret_cast(slice); } /* diff --git a/external/rocksdb/java/rocksjni/snapshot.cc b/external/rocksdb/java/rocksjni/snapshot.cc index cd10c97c81..fa8ede7abc 100644 --- a/external/rocksdb/java/rocksjni/snapshot.cc +++ b/external/rocksdb/java/rocksjni/snapshot.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/rocksjni/statistics.cc b/external/rocksdb/java/rocksjni/statistics.cc index bf170c6de4..c41ec72c0b 100644 --- a/external/rocksdb/java/rocksjni/statistics.cc +++ b/external/rocksdb/java/rocksjni/statistics.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -20,7 +20,7 @@ * Signature: (IJ)J */ jlong Java_org_rocksdb_Statistics_getTickerCount0( - JNIEnv* env, jobject jobj, int tickerType, jlong handle) { + JNIEnv* env, jobject jobj, jint tickerType, jlong handle) { auto st = reinterpret_cast(handle); assert(st != nullptr); @@ -29,11 +29,11 @@ jlong Java_org_rocksdb_Statistics_getTickerCount0( /* * Class: org_rocksdb_Statistics - * Method: geHistogramData0 + * Method: getHistogramData0 * Signature: (IJ)Lorg/rocksdb/HistogramData; */ -jobject Java_org_rocksdb_Statistics_geHistogramData0( - JNIEnv* env, jobject jobj, int histogramType, jlong handle) { +jobject Java_org_rocksdb_Statistics_getHistogramData0( + JNIEnv* env, jobject jobj, jint histogramType, jlong handle) { auto st = reinterpret_cast(handle); assert(st != nullptr); diff --git a/external/rocksdb/java/rocksjni/table.cc b/external/rocksdb/java/rocksjni/table.cc index e78e7e0d7d..204d1ba38f 100644 --- a/external/rocksdb/java/rocksjni/table.cc +++ b/external/rocksdb/java/rocksjni/table.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -38,13 +38,14 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZIZZJIBBI)J + * Signature: (ZJIJIIZIZZZJIBBI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, jint block_restart_interval, jboolean whole_key_filtering, jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, + jboolean pin_l0_filter_and_index_blocks_in_cache, jboolean hash_index_allow_collision, jlong block_cache_compressed_size, jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, jbyte jindex_type, jint jformat_version) { @@ -70,6 +71,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.filter_policy = *pFilterPolicy; } options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; + options.pin_l0_filter_and_index_blocks_in_cache = + pin_l0_filter_and_index_blocks_in_cache; options.hash_index_allow_collision = hash_index_allow_collision; if (block_cache_compressed_size > 0) { if (block_cache_compressd_num_shard_bits > 0) { diff --git a/external/rocksdb/java/rocksjni/transaction_log.cc b/external/rocksdb/java/rocksjni/transaction_log.cc index 1d3d7c100a..eed8d84b5a 100644 --- a/external/rocksdb/java/rocksjni/transaction_log.cc +++ b/external/rocksdb/java/rocksjni/transaction_log.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/rocksjni/ttl.cc b/external/rocksdb/java/rocksjni/ttl.cc index ec5b419f18..6b39252f1d 100644 --- a/external/rocksdb/java/rocksjni/ttl.cc +++ b/external/rocksdb/java/rocksjni/ttl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,6 +12,7 @@ #include #include #include +#include #include "include/org_rocksdb_TtlDB.h" #include "rocksdb/utilities/db_ttl.h" @@ -20,10 +21,10 @@ /* * Class: org_rocksdb_TtlDB * Method: open - * Signature: (JLjava/lang/String;IZ)V + * Signature: (JLjava/lang/String;IZ)J */ -void Java_org_rocksdb_TtlDB_open(JNIEnv* env, - jobject jttldb, jlong joptions_handle, jstring jdb_path, +jlong Java_org_rocksdb_TtlDB_open(JNIEnv* env, + jclass jcls, jlong joptions_handle, jstring jdb_path, jint jttl, jboolean jread_only) { auto* opt = reinterpret_cast(joptions_handle); rocksdb::DBWithTTL* db = nullptr; @@ -35,145 +36,103 @@ void Java_org_rocksdb_TtlDB_open(JNIEnv* env, // as TTLDB extends RocksDB on the java side, we can reuse // the RocksDB portal here. 
if (s.ok()) { - rocksdb::RocksDBJni::setHandle(env, jttldb, db); - return; + return reinterpret_cast(db); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } /* * Class: org_rocksdb_TtlDB * Method: openCF - * Signature: (JLjava/lang/String;Ljava/util/List; - * ILjava/util/List;Z)Ljava/util/List; + * Signature: (JLjava/lang/String;[[B[J[IZ)[J */ -jobject +jlongArray Java_org_rocksdb_TtlDB_openCF( - JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, - jobject jcfdesc_list, jint jcfdesc_count, jobject jttl_list, - jboolean jread_only) { - auto* opt = reinterpret_cast(jopt_handle); - rocksdb::DBWithTTL* db = nullptr; - const char* db_path = env->GetStringUTFChars(jdb_path, 0); - - std::vector cfnames_to_free; - std::vector jcfnames_for_free; + JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path, + jobjectArray jcolumn_names, jlongArray jcolumn_options, + jintArray jttls, jboolean jread_only) { + auto* opt = reinterpret_cast(jopt_handle); + const char* db_path = env->GetStringUTFChars(jdb_path, NULL); std::vector column_families; - std::vector ttl_values; - std::vector handles; - // get iterator for ColumnFamilyDescriptors - jobject iteratorObj = env->CallObjectMethod( - jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); - - // iterate over ColumnFamilyDescriptors - while (env->CallBooleanMethod( - iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - // get ColumnFamilyDescriptor - jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, - rocksdb::ListJni::getNextMethod(env)); - // get ColumnFamilyName - jbyteArray byteArray = static_cast(env->CallObjectMethod( - jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); - // get CF Options - jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); - rocksdb::ColumnFamilyOptions* 
cfOptions = - rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - - jbyte* cfname = env->GetByteArrayElements(byteArray, 0); - const int len = env->GetArrayLength(byteArray); - - // free allocated cfnames after call to open - cfnames_to_free.push_back(cfname); - jcfnames_for_free.push_back(byteArray); - column_families.push_back(rocksdb::ColumnFamilyDescriptor( - std::string(reinterpret_cast(cfname), len), *cfOptions)); + + jsize len_cols = env->GetArrayLength(jcolumn_names); + jlong* jco = env->GetLongArrayElements(jcolumn_options, NULL); + for(int i = 0; i < len_cols; i++) { + jobject jcn = env->GetObjectArrayElement(jcolumn_names, i); + jbyteArray jcn_ba = reinterpret_cast(jcn); + jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, NULL); + const int jcf_name_len = env->GetArrayLength(jcn_ba); + + //TODO(AR) do I need to make a copy of jco[i] ? + + std::string cf_name (reinterpret_cast(jcf_name), jcf_name_len); + rocksdb::ColumnFamilyOptions* cf_options = + reinterpret_cast(jco[i]); + column_families.push_back( + rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options)); + + env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT); + env->DeleteLocalRef(jcn); } - // get iterator for TTL values - iteratorObj = env->CallObjectMethod( - jttl_list, rocksdb::ListJni::getIteratorMethod(env)); - // iterate over TTL values - while (env->CallBooleanMethod( - iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - // get TTL object - jobject jttl_object = env->CallObjectMethod(iteratorObj, - rocksdb::ListJni::getNextMethod(env)); - // get Integer value - jclass jIntClazz = env->FindClass("java/lang/Integer"); - jmethodID getVal = env->GetMethodID(jIntClazz, "intValue", "()I"); - ttl_values.push_back(env->CallIntMethod(jttl_object, getVal)); + env->ReleaseLongArrayElements(jcolumn_options, jco, JNI_ABORT); + + std::vector handles; + rocksdb::DBWithTTL* db = nullptr; + + std::vector ttl_values; + jint* jttlv = env->GetIntArrayElements(jttls, NULL); 
+ jsize len_ttls = env->GetArrayLength(jttls); + for(int i = 0; i < len_ttls; i++) { + ttl_values.push_back(jttlv[i]); } + env->ReleaseIntArrayElements(jttls, jttlv, JNI_ABORT); + rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, column_families, &handles, &db, ttl_values, jread_only); - env->ReleaseStringUTFChars(jdb_path, db_path); - // free jbyte allocations - for (std::vector::size_type i = 0; - i != cfnames_to_free.size(); i++) { - // free cfnames - env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); - } - // check if open operation was successful if (s.ok()) { - rocksdb::RocksDBJni::setHandle(env, jdb, db); - jclass jListClazz = env->FindClass("java/util/ArrayList"); - jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jListClazz); - jobject jcfhandle_list = env->NewObject(jListClazz, - midList, handles.size()); - // insert in java list - for (std::vector::size_type i = 0; - i != handles.size(); i++) { - // jlong must be converted to Long due to collections restrictions - jclass jLongClazz = env->FindClass("java/lang/Long"); - jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); - jobject obj = env->NewObject(jLongClazz, midLong, - reinterpret_cast(handles[i])); - env->CallBooleanMethod(jcfhandle_list, - rocksdb::ListJni::getListAddMethodId(env), obj); + jsize resultsLen = 1 + len_cols; //db handle + column family handles + std::unique_ptr results = + std::unique_ptr(new jlong[resultsLen]); + results[0] = reinterpret_cast(db); + for(int i = 1; i <= len_cols; i++) { + results[i] = reinterpret_cast(handles[i - 1]); } - return jcfhandle_list; + jlongArray jresults = env->NewLongArray(resultsLen); + env->SetLongArrayRegion(jresults, 0, resultsLen, results.get()); + return jresults; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return NULL; } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return nullptr; } /* * Class: org_rocksdb_TtlDB * Method: createColumnFamilyWithTtl - 
* Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;I)J; + * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;[BJI)J; */ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( JNIEnv* env, jobject jobj, jlong jdb_handle, - jobject jcf_descriptor, jint jttl) { + jbyteArray jcolumn_name, jlong jcolumn_options, jint jttl) { rocksdb::ColumnFamilyHandle* handle; auto* db_handle = reinterpret_cast(jdb_handle); - // get ColumnFamilyName - jbyteArray byteArray = static_cast(env->CallObjectMethod( - jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); - // get CF Options - jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, - rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); - rocksdb::ColumnFamilyOptions* cfOptions = - rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - - jbyte* cfname = env->GetByteArrayElements(byteArray, 0); - const int len = env->GetArrayLength(byteArray); + jbyte* cfname = env->GetByteArrayElements(jcolumn_name, 0); + const int len = env->GetArrayLength(jcolumn_name); + + auto* cfOptions = + reinterpret_cast(jcolumn_options); rocksdb::Status s = db_handle->CreateColumnFamilyWithTtl( *cfOptions, std::string(reinterpret_cast(cfname), len), &handle, jttl); - env->ReleaseByteArrayElements(byteArray, cfname, 0); + env->ReleaseByteArrayElements(jcolumn_name, cfname, 0); if (s.ok()) { return reinterpret_cast(handle); diff --git a/external/rocksdb/java/rocksjni/write_batch.cc b/external/rocksdb/java/rocksjni/write_batch.cc index aa0c2309aa..2ae9587025 100644 --- a/external/rocksdb/java/rocksjni/write_batch.cc +++ b/external/rocksdb/java/rocksjni/write_batch.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,43 +7,43 @@ // calling c++ rocksdb::WriteBatch methods from Java side. #include +#include "db/memtable.h" +#include "db/write_batch_internal.h" #include "include/org_rocksdb_WriteBatch.h" #include "include/org_rocksdb_WriteBatch_Handler.h" -#include "rocksjni/portal.h" -#include "rocksjni/writebatchhandlerjnicallback.h" #include "rocksdb/db.h" -#include "rocksdb/immutable_options.h" -#include "db/memtable.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/status.h" -#include "db/write_batch_internal.h" -#include "db/writebuffer.h" #include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" +#include "rocksjni/portal.h" +#include "rocksjni/writebatchhandlerjnicallback.h" +#include "table/scoped_arena_iterator.h" #include "util/logging.h" -#include "util/scoped_arena_iterator.h" #include "util/testharness.h" /* * Class: org_rocksdb_WriteBatch * Method: newWriteBatch - * Signature: (I)V + * Signature: (I)J */ -void Java_org_rocksdb_WriteBatch_newWriteBatch( - JNIEnv* env, jobject jobj, jint jreserved_bytes) { +jlong Java_org_rocksdb_WriteBatch_newWriteBatch( + JNIEnv* env, jclass jcls, jint jreserved_bytes) { rocksdb::WriteBatch* wb = new rocksdb::WriteBatch( static_cast(jreserved_bytes)); - - rocksdb::WriteBatchJni::setHandle(env, jobj, wb); + return reinterpret_cast(wb); } /* * Class: org_rocksdb_WriteBatch * Method: count0 - * Signature: ()I + * Signature: (J)I */ -jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); +jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); return static_cast(wb->Count()); @@ -52,25 +52,57 @@ jint 
Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj) { /* * Class: org_rocksdb_WriteBatch * Method: clear0 - * Signature: ()V + * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); +void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); wb->Clear(); } +/* + * Class: org_rocksdb_WriteBatch + * Method: setSavePoint0 + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatch_setSavePoint0( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + wb->SetSavePoint(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: rollbackToSavePoint0 + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto s = wb->RollbackToSavePoint(); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + /* * Class: org_rocksdb_WriteBatch * Method: put - * Signature: ([BI[BI)V + * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatch_put___3BI_3BI( - JNIEnv* env, jobject jobj, +void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { - auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto put = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { wb->Put(key, value); @@ -82,13 +114,13 @@ void Java_org_rocksdb_WriteBatch_put___3BI_3BI( /* * Class: org_rocksdb_WriteBatch * Method: put - * Signature: ([BI[BIJ)V + * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ( - JNIEnv* env, jobject jobj, +void 
Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); @@ -102,13 +134,13 @@ void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ( /* * Class: org_rocksdb_WriteBatch * Method: merge - * Signature: ([BI[BI)V + * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatch_merge___3BI_3BI( - JNIEnv* env, jobject jobj, +void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { - auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto merge = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { wb->Merge(key, value); @@ -120,13 +152,13 @@ void Java_org_rocksdb_WriteBatch_merge___3BI_3BI( /* * Class: org_rocksdb_WriteBatch * Method: merge - * Signature: ([BI[BIJ)V + * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatch_merge___3BI_3BIJ( - JNIEnv* env, jobject jobj, +void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BIJ( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); @@ -140,12 +172,12 @@ void Java_org_rocksdb_WriteBatch_merge___3BI_3BIJ( /* * Class: org_rocksdb_WriteBatch * Method: remove - * Signature: ([BI)V + * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatch_remove___3BI( - JNIEnv* env, jobject jobj, +void 
Java_org_rocksdb_WriteBatch_remove__J_3BI( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len) { - auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto remove = [&wb] (rocksdb::Slice key) { wb->Delete(key); @@ -156,12 +188,12 @@ void Java_org_rocksdb_WriteBatch_remove___3BI( /* * Class: org_rocksdb_WriteBatch * Method: remove - * Signature: ([BIJ)V + * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatch_remove___3BIJ( - JNIEnv* env, jobject jobj, +void Java_org_rocksdb_WriteBatch_remove__J_3BIJ( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { - auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); @@ -174,11 +206,12 @@ void Java_org_rocksdb_WriteBatch_remove___3BIJ( /* * Class: org_rocksdb_WriteBatch * Method: putLogData - * Signature: ([BI)V + * Signature: (J[BI)V */ void Java_org_rocksdb_WriteBatch_putLogData( - JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { - auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jblob, + jint jblob_len) { + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto putLogData = [&wb] (rocksdb::Slice blob) { wb->PutLogData(blob); @@ -189,11 +222,11 @@ void Java_org_rocksdb_WriteBatch_putLogData( /* * Class: org_rocksdb_WriteBatch * Method: iterate - * Signature: (J)V + * Signature: (JJ)V */ void Java_org_rocksdb_WriteBatch_iterate( - JNIEnv* env , jobject jobj, jlong handlerHandle) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + JNIEnv* env , jobject jobj, jlong jwb_handle, jlong handlerHandle) { + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); rocksdb::Status s = wb->Iterate( @@ -218,13 +251,13 @@ 
void Java_org_rocksdb_WriteBatch_disposeInternal( /* * Class: org_rocksdb_WriteBatch_Handler * Method: createNewHandler0 - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0( +jlong Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0( JNIEnv* env, jobject jobj) { const rocksdb::WriteBatchHandlerJniCallback* h = new rocksdb::WriteBatchHandlerJniCallback(env, jobj); - rocksdb::WriteBatchHandlerJni::setHandle(env, jobj, h); + return reinterpret_cast(h); } /* diff --git a/external/rocksdb/java/rocksjni/write_batch_test.cc b/external/rocksdb/java/rocksjni/write_batch_test.cc index d54029141e..371744e4f9 100644 --- a/external/rocksdb/java/rocksjni/write_batch_test.cc +++ b/external/rocksdb/java/rocksjni/write_batch_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -9,30 +9,30 @@ #include "db/memtable.h" #include "db/write_batch_internal.h" -#include "db/writebuffer.h" #include "include/org_rocksdb_WriteBatch.h" -#include "include/org_rocksdb_WriteBatch_Handler.h" #include "include/org_rocksdb_WriteBatchTest.h" #include "include/org_rocksdb_WriteBatchTestInternalHelper.h" +#include "include/org_rocksdb_WriteBatch_Handler.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/immutable_options.h" #include "rocksdb/memtablerep.h" #include "rocksdb/status.h" #include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" #include "rocksjni/portal.h" +#include "table/scoped_arena_iterator.h" #include "util/logging.h" -#include "util/scoped_arena_iterator.h" #include "util/testharness.h" /* * Class: org_rocksdb_WriteBatchTest * Method: getContents - * Signature: (Lorg/rocksdb/WriteBatch;)[B + * Signature: (J)[B */ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( - JNIEnv* env, jclass jclazz, jobject jobj) { - rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj); + JNIEnv* env, jclass jclazz, jlong jwb_handle) { + auto* b = reinterpret_cast(jwb_handle); assert(b != nullptr); // todo: Currently the following code is directly copied from @@ -42,7 +42,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator()); auto factory = std::make_shared(); rocksdb::Options options; - rocksdb::WriteBuffer wb(options.db_write_buffer_size); + rocksdb::WriteBufferManager wb(options.db_write_buffer_size); options.memtable_factory = factory; rocksdb::MemTable* mem = new rocksdb::MemTable( cmp, rocksdb::ImmutableCFOptions(options), @@ -52,7 +52,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); rocksdb::Status s = - rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); + rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default, 
nullptr); int count = 0; rocksdb::Arena arena; rocksdb::ScopedArenaIterator iter(mem->NewIterator( @@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); - assert(rocksdb::ParseInternalKey(iter->key(), &ikey)); + bool parsed = rocksdb::ParseInternalKey(iter->key(), &ikey); + assert(parsed); switch (ikey.type) { case rocksdb::kTypeValue: state.append("Put("); @@ -108,11 +109,11 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( /* * Class: org_rocksdb_WriteBatchTestInternalHelper * Method: setSequence - * Signature: (Lorg/rocksdb/WriteBatch;J)V + * Signature: (JJ)V */ void Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence( - JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + JNIEnv* env, jclass jclazz, jlong jwb_handle, jlong jsn) { + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); rocksdb::WriteBatchInternal::SetSequence( @@ -122,11 +123,11 @@ void Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence( /* * Class: org_rocksdb_WriteBatchTestInternalHelper * Method: sequence - * Signature: (Lorg/rocksdb/WriteBatch;)J + * Signature: (J)J */ jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence( - JNIEnv* env, jclass jclazz, jobject jobj) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + JNIEnv* env, jclass jclazz, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); return static_cast(rocksdb::WriteBatchInternal::Sequence(wb)); @@ -135,13 +136,13 @@ jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence( /* * Class: org_rocksdb_WriteBatchTestInternalHelper * Method: append - * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V + * Signature: (JJ)V */ void Java_org_rocksdb_WriteBatchTestInternalHelper_append( - 
JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) { - rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1); + JNIEnv* env, jclass jclazz, jlong jwb_handle_1, jlong jwb_handle_2) { + auto* wb1 = reinterpret_cast(jwb_handle_1); assert(wb1 != nullptr); - rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2); + auto* wb2 = reinterpret_cast(jwb_handle_2); assert(wb2 != nullptr); rocksdb::WriteBatchInternal::Append(wb1, wb2); diff --git a/external/rocksdb/java/rocksjni/write_batch_with_index.cc b/external/rocksdb/java/rocksjni/write_batch_with_index.cc index 7c57a0e061..2b8cf778b6 100644 --- a/external/rocksdb/java/rocksjni/write_batch_with_index.cc +++ b/external/rocksdb/java/rocksjni/write_batch_with_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -15,51 +15,50 @@ /* * Class: org_rocksdb_WriteBatchWithIndex * Method: newWriteBatchWithIndex - * Signature: ()V + * Signature: ()J */ -void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__( + JNIEnv* env, jclass jcls) { rocksdb::WriteBatchWithIndex* wbwi = new rocksdb::WriteBatchWithIndex(); - rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); + return reinterpret_cast(wbwi); } /* * Class: org_rocksdb_WriteBatchWithIndex * Method: newWriteBatchWithIndex - * Signature: (Z)V + * Signature: (Z)J */ -void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z( - JNIEnv* env, jobject jobj, jboolean joverwrite_key) { +jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z( + JNIEnv* env, jclass jcls, jboolean joverwrite_key) { rocksdb::WriteBatchWithIndex* wbwi = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0, static_cast(joverwrite_key)); - rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); + return reinterpret_cast(wbwi); } /* * Class: org_rocksdb_WriteBatchWithIndex * Method: newWriteBatchWithIndex - * Signature: (JIZ)V + * Signature: (JIZ)J */ -void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JIZ( - JNIEnv* env, jobject jobj, jlong jfallback_index_comparator_handle, +jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JIZ( + JNIEnv* env, jclass jcls, jlong jfallback_index_comparator_handle, jint jreserved_bytes, jboolean joverwrite_key) { rocksdb::WriteBatchWithIndex* wbwi = new rocksdb::WriteBatchWithIndex( reinterpret_cast(jfallback_index_comparator_handle), static_cast(jreserved_bytes), static_cast(joverwrite_key)); - rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); + return reinterpret_cast(wbwi); } /* * Class: org_rocksdb_WriteBatchWithIndex - * Method: count - * Signature: ()I + * Method: count0 + * Signature: (J)I */ jint 
Java_org_rocksdb_WriteBatchWithIndex_count0( - JNIEnv* env, jobject jobj) { - rocksdb::WriteBatchWithIndex* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + JNIEnv* env, jobject jobj, jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); return static_cast(wbwi->GetWriteBatch()->Count()); @@ -68,13 +67,12 @@ jint Java_org_rocksdb_WriteBatchWithIndex_count0( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: put - * Signature: ([BI[BI)V + * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BI( - JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len) { - auto* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); +void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BI( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto put = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { wbwi->Put(key, value); @@ -86,13 +84,13 @@ void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BI( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: put - * Signature: ([BI[BIJ)V + * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BIJ( - JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); +void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BIJ( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, + jlong jcf_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); @@ -106,13 +104,12 @@ void 
Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BIJ( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: merge - * Signature: ([BI[BI)V + * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BI( - JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len) { - auto* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); +void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BI( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto merge = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { wbwi->Merge(key, value); @@ -124,13 +121,13 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BI( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: merge - * Signature: ([BI[BIJ)V + * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BIJ( - JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); +void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BIJ( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, + jlong jcf_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); @@ -144,12 +141,12 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BIJ( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: remove - * Signature: ([BI)V + * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_remove___3BI( - JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) { - auto* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); +void 
Java_org_rocksdb_WriteBatchWithIndex_remove__J_3BI( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto remove = [&wbwi] (rocksdb::Slice key) { wbwi->Delete(key); @@ -160,13 +157,12 @@ void Java_org_rocksdb_WriteBatchWithIndex_remove___3BI( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: remove - * Signature: ([BIJ)V + * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_remove___3BIJ( - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, jlong jcf_handle) { - auto* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); +void Java_org_rocksdb_WriteBatchWithIndex_remove__J_3BIJ( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len, jlong jcf_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); @@ -179,12 +175,12 @@ void Java_org_rocksdb_WriteBatchWithIndex_remove___3BIJ( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: putLogData - * Signature: ([BI)V + * Signature: (J[BI)V */ void Java_org_rocksdb_WriteBatchWithIndex_putLogData( - JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { - auto* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jblob, + jint jblob_len) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto putLogData = [&wbwi] (rocksdb::Slice blob) { wbwi->PutLogData(blob); @@ -195,26 +191,56 @@ void Java_org_rocksdb_WriteBatchWithIndex_putLogData( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: clear - * Signature: ()V + * Signature: (J)V */ void Java_org_rocksdb_WriteBatchWithIndex_clear0( - JNIEnv* env, jobject jobj) { - rocksdb::WriteBatchWithIndex* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + JNIEnv* env, jobject jobj, jlong 
jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); - wbwi->GetWriteBatch()->Clear(); + wbwi->Clear(); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: setSavePoint0 + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0( + JNIEnv* env, jobject jobj, jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + wbwi->SetSavePoint(); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: rollbackToSavePoint0 + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0( + JNIEnv* env, jobject jobj, jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + auto s = wbwi->RollbackToSavePoint(); + + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } /* * Class: org_rocksdb_WriteBatchWithIndex * Method: iterator0 - * Signature: ()J + * Signature: (J)J */ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0( - JNIEnv* env, jobject jobj) { - rocksdb::WriteBatchWithIndex* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + JNIEnv* env, jobject jobj, jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator(); return reinterpret_cast(wbwi_iterator); } @@ -222,12 +248,11 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: iterator1 - * Signature: (J)J + * Signature: (JJ)J */ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1( - JNIEnv* env, jobject jobj, jlong jcf_handle) { - rocksdb::WriteBatchWithIndex* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jcf_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); auto* cf_handle = reinterpret_cast(jcf_handle); rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator(cf_handle); return 
reinterpret_cast(wbwi_iterator); @@ -236,12 +261,12 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: iteratorWithBase - * Signature: (JJ)J + * Signature: (JJJ)J */ jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( - JNIEnv* env, jobject jobj, jlong jcf_handle, jlong jbi_handle) { - rocksdb::WriteBatchWithIndex* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jcf_handle, + jlong jbi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); auto* cf_handle = reinterpret_cast(jcf_handle); auto* base_iterator = reinterpret_cast(jbi_handle); auto* iterator = wbwi->NewIteratorWithBase(cf_handle, base_iterator); @@ -360,27 +385,57 @@ void Java_org_rocksdb_WBWIRocksIterator_status0( /* * Class: org_rocksdb_WBWIRocksIterator * Method: entry1 - * Signature: (JLorg/rocksdb/WBWIRocksIterator/WriteEntry;)V + * Signature: (J)[J */ -void Java_org_rocksdb_WBWIRocksIterator_entry1( - JNIEnv* env, jobject jobj, jlong handle, jobject jwrite_entry) { +jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1( + JNIEnv* env, jobject jobj, jlong handle) { auto* it = reinterpret_cast(handle); const rocksdb::WriteEntry& we = it->Entry(); - jobject jwe = rocksdb::WBWIRocksIteratorJni::getWriteEntry(env, jobj); - rocksdb::WriteEntryJni::setWriteType(env, jwe, we.type); + jlong results[3]; + + //set the type of the write entry + switch (we.type) { + case rocksdb::kPutRecord: + results[0] = 0x1; + break; + + case rocksdb::kMergeRecord: + results[0] = 0x2; + break; + + case rocksdb::kDeleteRecord: + results[0] = 0x4; + break; + + case rocksdb::kLogDataRecord: + results[0] = 0x8; + break; + + default: + results[0] = 0x0; + } + + //TODO(AR) do we leak buf and value_buf? 
+ + //set the pointer to the key slice char* buf = new char[we.key.size()]; memcpy(buf, we.key.data(), we.key.size()); auto* key_slice = new rocksdb::Slice(buf, we.key.size()); - rocksdb::WriteEntryJni::setKey(env, jwe, key_slice); + results[1] = reinterpret_cast(key_slice); + //set the pointer to the value slice if (we.type == rocksdb::kDeleteRecord || we.type == rocksdb::kLogDataRecord) { // set native handle of value slice to null if no value available - rocksdb::WriteEntryJni::setValue(env, jwe, nullptr); + results[2] = 0; } else { char* value_buf = new char[we.value.size()]; memcpy(value_buf, we.value.data(), we.value.size()); auto* value_slice = new rocksdb::Slice(value_buf, we.value.size()); - rocksdb::WriteEntryJni::setValue(env, jwe, value_slice); + results[2] = reinterpret_cast(value_slice); } + + jlongArray jresults = env->NewLongArray(3); + env->SetLongArrayRegion(jresults, 0, 3, results); + return jresults; } diff --git a/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc b/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc index b12e355448..b252365184 100644 --- a/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc +++ b/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h b/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h index 9a2a47e80c..1c421db030 100644 --- a/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h +++ b/external/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java b/external/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java index da9f4d28bd..7dd0de9cc4 100644 --- a/external/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java +++ b/external/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -22,73 +22,57 @@ public static void main(String[] args) throws RocksDBException { String db_path = args[0]; System.out.println("RocksDBColumnFamilySample"); - RocksDB db = null; - Options options = null; - ColumnFamilyHandle columnFamilyHandle = null; - WriteBatch wb = null; - try { - options = new Options().setCreateIfMissing(true); - db = RocksDB.open(options, db_path); + try(final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, db_path)) { + assert(db != null); // create column family - columnFamilyHandle = db.createColumnFamily( + try(final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily( new ColumnFamilyDescriptor("new_cf".getBytes(), - new ColumnFamilyOptions())); - assert(columnFamilyHandle != null); - - } finally { - if (columnFamilyHandle != null) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - db = null; - } - if (options != null) { - options.dispose(); + new ColumnFamilyOptions()))) { + assert (columnFamilyHandle != null); } } // open DB with two column families - List columnFamilyDescriptors = new ArrayList<>(); + final List columnFamilyDescriptors = + new ArrayList<>(); // have to open default column family columnFamilyDescriptors.add(new ColumnFamilyDescriptor( RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions())); // open the new one, too columnFamilyDescriptors.add(new ColumnFamilyDescriptor( "new_cf".getBytes(), new ColumnFamilyOptions())); - List columnFamilyHandles = new ArrayList<>(); - try { - db = RocksDB.open(new DBOptions(), db_path, - columnFamilyDescriptors, columnFamilyHandles); + final List columnFamilyHandles = new ArrayList<>(); + try(final DBOptions options = new DBOptions(); + final RocksDB db = RocksDB.open(options, db_path, + columnFamilyDescriptors, columnFamilyHandles)) { assert(db != null); - // put and get from non-default column family - db.put(columnFamilyHandles.get(0), new WriteOptions(), - "key".getBytes(), 
"value".getBytes()); - String value = new String(db.get(columnFamilyHandles.get(0), - "key".getBytes())); + try { + // put and get from non-default column family + db.put(columnFamilyHandles.get(0), new WriteOptions(), + "key".getBytes(), "value".getBytes()); + String value = new String(db.get(columnFamilyHandles.get(0), + "key".getBytes())); - // atomic write - wb = new WriteBatch(); - wb.put(columnFamilyHandles.get(0), "key2".getBytes(), "value2".getBytes()); - wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes()); - wb.remove(columnFamilyHandles.get(0), "key".getBytes()); - db.write(new WriteOptions(), wb); + // atomic write + try (final WriteBatch wb = new WriteBatch()) { + wb.put(columnFamilyHandles.get(0), "key2".getBytes(), + "value2".getBytes()); + wb.put(columnFamilyHandles.get(1), "key3".getBytes(), + "value3".getBytes()); + wb.remove(columnFamilyHandles.get(0), "key".getBytes()); + db.write(new WriteOptions(), wb); + } - // drop column family - db.dropColumnFamily(columnFamilyHandles.get(1)); - - } finally { - for (ColumnFamilyHandle handle : columnFamilyHandles){ - handle.dispose(); - } - if (db != null) { - db.close(); - } - if (wb != null) { - wb.dispose(); + // drop column family + db.dropColumnFamily(columnFamilyHandles.get(1)); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } } } } diff --git a/external/rocksdb/java/samples/src/main/java/RocksDBSample.java b/external/rocksdb/java/samples/src/main/java/RocksDBSample.java index 402fd8f892..de5bc26d5d 100644 --- a/external/rocksdb/java/samples/src/main/java/RocksDBSample.java +++ b/external/rocksdb/java/samples/src/main/java/RocksDBSample.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,8 +8,10 @@ import java.util.List; import java.util.Map; import java.util.ArrayList; + import org.rocksdb.*; import org.rocksdb.util.SizeUnit; + import java.io.IOException; public class RocksDBSample { @@ -26,287 +28,273 @@ public static void main(String[] args) { String db_path_not_found = db_path + "_not_found"; System.out.println("RocksDBSample"); - RocksDB db = null; - Options options = new Options(); - try { - db = RocksDB.open(options, db_path_not_found); - assert(false); - } catch (RocksDBException e) { - System.out.format("caught the expceted exception -- %s\n", e); - assert(db == null); - } + try (final Options options = new Options(); + final Filter bloomFilter = new BloomFilter(10); + final ReadOptions readOptions = new ReadOptions() + .setFillCache(false)) { + + try (final RocksDB db = RocksDB.open(options, db_path_not_found)) { + assert (false); + } catch (RocksDBException e) { + System.out.format("caught the expected exception -- %s\n", e); + } - try { - options.setCreateIfMissing(true) - .createStatistics() - .setWriteBufferSize(8 * SizeUnit.KB) - .setMaxWriteBufferNumber(3) - .setMaxBackgroundCompactions(10) - .setCompressionType(CompressionType.SNAPPY_COMPRESSION) - .setCompactionStyle(CompactionStyle.UNIVERSAL); - } catch (IllegalArgumentException e) { - assert(false); - } + try { + options.setCreateIfMissing(true) + .createStatistics() + .setWriteBufferSize(8 * SizeUnit.KB) + .setMaxWriteBufferNumber(3) + .setMaxBackgroundCompactions(10) + .setCompressionType(CompressionType.SNAPPY_COMPRESSION) + .setCompactionStyle(CompactionStyle.UNIVERSAL); + } catch (IllegalArgumentException e) { + assert (false); + } - Statistics stats = options.statisticsPtr(); - - assert(options.createIfMissing() == true); - assert(options.writeBufferSize() == 8 * SizeUnit.KB); - assert(options.maxWriteBufferNumber() == 3); - assert(options.maxBackgroundCompactions() == 
10); - assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION); - assert(options.compactionStyle() == CompactionStyle.UNIVERSAL); - - assert(options.memTableFactoryName().equals("SkipListFactory")); - options.setMemTableConfig( - new HashSkipListMemTableConfig() - .setHeight(4) - .setBranchingFactor(4) - .setBucketCount(2000000)); - assert(options.memTableFactoryName().equals("HashSkipListRepFactory")); - - options.setMemTableConfig( - new HashLinkedListMemTableConfig() - .setBucketCount(100000)); - assert(options.memTableFactoryName().equals("HashLinkedListRepFactory")); - - options.setMemTableConfig( - new VectorMemTableConfig().setReservedSize(10000)); - assert(options.memTableFactoryName().equals("VectorRepFactory")); - - options.setMemTableConfig(new SkipListMemTableConfig()); - assert(options.memTableFactoryName().equals("SkipListFactory")); - - options.setTableFormatConfig(new PlainTableConfig()); - // Plain-Table requires mmap read - options.setAllowMmapReads(true); - assert(options.tableFactoryName().equals("PlainTable")); - - options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, - 10000, 10)); - options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); - - - Filter bloomFilter = new BloomFilter(10); - BlockBasedTableConfig table_options = new BlockBasedTableConfig(); - table_options.setBlockCacheSize(64 * SizeUnit.KB) - .setFilter(bloomFilter) - .setCacheNumShardBits(6) - .setBlockSizeDeviation(5) - .setBlockRestartInterval(10) - .setCacheIndexAndFilterBlocks(true) - .setHashIndexAllowCollision(false) - .setBlockCacheCompressedSize(64 * SizeUnit.KB) - .setBlockCacheCompressedNumShardBits(10); - - assert(table_options.blockCacheSize() == 64 * SizeUnit.KB); - assert(table_options.cacheNumShardBits() == 6); - assert(table_options.blockSizeDeviation() == 5); - assert(table_options.blockRestartInterval() == 10); - assert(table_options.cacheIndexAndFilterBlocks() == true); - 
assert(table_options.hashIndexAllowCollision() == false); - assert(table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); - assert(table_options.blockCacheCompressedNumShardBits() == 10); - - options.setTableFormatConfig(table_options); - assert(options.tableFactoryName().equals("BlockBasedTable")); - - try { - db = RocksDB.open(options, db_path); - db.put("hello".getBytes(), "world".getBytes()); - byte[] value = db.get("hello".getBytes()); - assert("world".equals(new String(value))); - String str = db.getProperty("rocksdb.stats"); - assert(str != null && !str.equals("")); - } catch (RocksDBException e) { - System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); - assert(db == null); - assert(false); - } - // be sure to release the c++ pointer - db.close(); - - ReadOptions readOptions = new ReadOptions(); - readOptions.setFillCache(false); - - try { - db = RocksDB.open(options, db_path); - db.put("hello".getBytes(), "world".getBytes()); - byte[] value = db.get("hello".getBytes()); - System.out.format("Get('hello') = %s\n", - new String(value)); - - for (int i = 1; i <= 9; ++i) { - for (int j = 1; j <= 9; ++j) { - db.put(String.format("%dx%d", i, j).getBytes(), - String.format("%d", i * j).getBytes()); - } + Statistics stats = options.statisticsPtr(); + + assert (options.createIfMissing() == true); + assert (options.writeBufferSize() == 8 * SizeUnit.KB); + assert (options.maxWriteBufferNumber() == 3); + assert (options.maxBackgroundCompactions() == 10); + assert (options.compressionType() == CompressionType.SNAPPY_COMPRESSION); + assert (options.compactionStyle() == CompactionStyle.UNIVERSAL); + + assert (options.memTableFactoryName().equals("SkipListFactory")); + options.setMemTableConfig( + new HashSkipListMemTableConfig() + .setHeight(4) + .setBranchingFactor(4) + .setBucketCount(2000000)); + assert (options.memTableFactoryName().equals("HashSkipListRepFactory")); + + options.setMemTableConfig( + new HashLinkedListMemTableConfig() + 
.setBucketCount(100000)); + assert (options.memTableFactoryName().equals("HashLinkedListRepFactory")); + + options.setMemTableConfig( + new VectorMemTableConfig().setReservedSize(10000)); + assert (options.memTableFactoryName().equals("VectorRepFactory")); + + options.setMemTableConfig(new SkipListMemTableConfig()); + assert (options.memTableFactoryName().equals("SkipListFactory")); + + options.setTableFormatConfig(new PlainTableConfig()); + // Plain-Table requires mmap read + options.setAllowMmapReads(true); + assert (options.tableFactoryName().equals("PlainTable")); + + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, + 10000, 10)); + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); + + final BlockBasedTableConfig table_options = new BlockBasedTableConfig(); + table_options.setBlockCacheSize(64 * SizeUnit.KB) + .setFilter(bloomFilter) + .setCacheNumShardBits(6) + .setBlockSizeDeviation(5) + .setBlockRestartInterval(10) + .setCacheIndexAndFilterBlocks(true) + .setHashIndexAllowCollision(false) + .setBlockCacheCompressedSize(64 * SizeUnit.KB) + .setBlockCacheCompressedNumShardBits(10); + + assert (table_options.blockCacheSize() == 64 * SizeUnit.KB); + assert (table_options.cacheNumShardBits() == 6); + assert (table_options.blockSizeDeviation() == 5); + assert (table_options.blockRestartInterval() == 10); + assert (table_options.cacheIndexAndFilterBlocks() == true); + assert (table_options.hashIndexAllowCollision() == false); + assert (table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); + assert (table_options.blockCacheCompressedNumShardBits() == 10); + + options.setTableFormatConfig(table_options); + assert (options.tableFactoryName().equals("BlockBasedTable")); + + try (final RocksDB db = RocksDB.open(options, db_path)) { + db.put("hello".getBytes(), "world".getBytes()); + byte[] value = db.get("hello".getBytes()); + assert ("world".equals(new String(value))); + String str = db.getProperty("rocksdb.stats"); + 
assert (str != null && !str.equals("")); + } catch (RocksDBException e) { + System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); + assert (false); } - for (int i = 1; i <= 9; ++i) { - for (int j = 1; j <= 9; ++j) { - System.out.format("%s ", new String(db.get( - String.format("%dx%d", i, j).getBytes()))); + try (final RocksDB db = RocksDB.open(options, db_path)) { + db.put("hello".getBytes(), "world".getBytes()); + byte[] value = db.get("hello".getBytes()); + System.out.format("Get('hello') = %s\n", + new String(value)); + + for (int i = 1; i <= 9; ++i) { + for (int j = 1; j <= 9; ++j) { + db.put(String.format("%dx%d", i, j).getBytes(), + String.format("%d", i * j).getBytes()); + } + } + + for (int i = 1; i <= 9; ++i) { + for (int j = 1; j <= 9; ++j) { + System.out.format("%s ", new String(db.get( + String.format("%dx%d", i, j).getBytes()))); + } + System.out.println(""); } - System.out.println(""); - } - // write batch test - WriteOptions writeOpt = new WriteOptions(); - for (int i = 10; i <= 19; ++i) { - WriteBatch batch = new WriteBatch(); - for (int j = 10; j <= 19; ++j) { - batch.put(String.format("%dx%d", i, j).getBytes(), + // write batch test + try (final WriteOptions writeOpt = new WriteOptions()) { + for (int i = 10; i <= 19; ++i) { + try (final WriteBatch batch = new WriteBatch()) { + for (int j = 10; j <= 19; ++j) { + batch.put(String.format("%dx%d", i, j).getBytes(), String.format("%d", i * j).getBytes()); + } + db.write(writeOpt, batch); + } + } } - db.write(writeOpt, batch); - batch.dispose(); - } - for (int i = 10; i <= 19; ++i) { - for (int j = 10; j <= 19; ++j) { - assert(new String( - db.get(String.format("%dx%d", i, j).getBytes())).equals( - String.format("%d", i * j))); - System.out.format("%s ", new String(db.get( - String.format("%dx%d", i, j).getBytes()))); + for (int i = 10; i <= 19; ++i) { + for (int j = 10; j <= 19; ++j) { + assert (new String( + db.get(String.format("%dx%d", i, j).getBytes())).equals( + 
String.format("%d", i * j))); + System.out.format("%s ", new String(db.get( + String.format("%dx%d", i, j).getBytes()))); + } + System.out.println(""); } - System.out.println(""); - } - writeOpt.dispose(); - - value = db.get("1x1".getBytes()); - assert(value != null); - value = db.get("world".getBytes()); - assert(value == null); - value = db.get(readOptions, "world".getBytes()); - assert(value == null); - - byte[] testKey = "asdf".getBytes(); - byte[] testValue = - "asdfghjkl;'?> insufficientArray.length); - len = db.get("asdfjkl;".getBytes(), enoughArray); - assert(len == RocksDB.NOT_FOUND); - len = db.get(testKey, enoughArray); - assert(len == testValue.length); - - len = db.get(readOptions, testKey, insufficientArray); - assert(len > insufficientArray.length); - len = db.get(readOptions, "asdfjkl;".getBytes(), enoughArray); - assert(len == RocksDB.NOT_FOUND); - len = db.get(readOptions, testKey, enoughArray); - assert(len == testValue.length); - - db.remove(testKey); - len = db.get(testKey, enoughArray); - assert(len == RocksDB.NOT_FOUND); - - // repeat the test with WriteOptions - WriteOptions writeOpts = new WriteOptions(); - writeOpts.setSync(true); - writeOpts.setDisableWAL(true); - db.put(writeOpts, testKey, testValue); - len = db.get(testKey, enoughArray); - assert(len == testValue.length); - assert(new String(testValue).equals( - new String(enoughArray, 0, len))); - writeOpts.dispose(); - try { - for (TickerType statsType : TickerType.values()) { - stats.getTickerCount(statsType); + value = db.get("1x1".getBytes()); + assert (value != null); + value = db.get("world".getBytes()); + assert (value == null); + value = db.get(readOptions, "world".getBytes()); + assert (value == null); + + byte[] testKey = "asdf".getBytes(); + byte[] testValue = + "asdfghjkl;'?> insufficientArray.length); + len = db.get("asdfjkl;".getBytes(), enoughArray); + assert (len == RocksDB.NOT_FOUND); + len = db.get(testKey, enoughArray); + assert (len == testValue.length); + + len = 
db.get(readOptions, testKey, insufficientArray); + assert (len > insufficientArray.length); + len = db.get(readOptions, "asdfjkl;".getBytes(), enoughArray); + assert (len == RocksDB.NOT_FOUND); + len = db.get(readOptions, testKey, enoughArray); + assert (len == testValue.length); + + db.remove(testKey); + len = db.get(testKey, enoughArray); + assert (len == RocksDB.NOT_FOUND); + + // repeat the test with WriteOptions + try (final WriteOptions writeOpts = new WriteOptions()) { + writeOpts.setSync(true); + writeOpts.setDisableWAL(true); + db.put(writeOpts, testKey, testValue); + len = db.get(testKey, enoughArray); + assert (len == testValue.length); + assert (new String(testValue).equals( + new String(enoughArray, 0, len))); } - System.out.println("getTickerCount() passed."); - } catch (Exception e) { - System.out.println("Failed in call to getTickerCount()"); - assert(false); //Should never reach here. - } - try { - for (HistogramType histogramType : HistogramType.values()) { - HistogramData data = stats.geHistogramData(histogramType); + try { + for (TickerType statsType : TickerType.values()) { + stats.getTickerCount(statsType); + } + System.out.println("getTickerCount() passed."); + } catch (Exception e) { + System.out.println("Failed in call to getTickerCount()"); + assert (false); //Should never reach here. } - System.out.println("geHistogramData() passed."); - } catch (Exception e) { - System.out.println("Failed in call to geHistogramData()"); - assert(false); //Should never reach here. 
- } - - RocksIterator iterator = db.newIterator(); - - boolean seekToFirstPassed = false; - for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) { - iterator.status(); - assert(iterator.key() != null); - assert(iterator.value() != null); - seekToFirstPassed = true; - } - if(seekToFirstPassed) { - System.out.println("iterator seekToFirst tests passed."); - } - - boolean seekToLastPassed = false; - for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) { - iterator.status(); - assert(iterator.key() != null); - assert(iterator.value() != null); - seekToLastPassed = true; - } - - if(seekToLastPassed) { - System.out.println("iterator seekToLastPassed tests passed."); - } - iterator.seekToFirst(); - iterator.seek(iterator.key()); - assert(iterator.key() != null); - assert(iterator.value() != null); + try { + for (HistogramType histogramType : HistogramType.values()) { + HistogramData data = stats.geHistogramData(histogramType); + } + System.out.println("geHistogramData() passed."); + } catch (Exception e) { + System.out.println("Failed in call to geHistogramData()"); + assert (false); //Should never reach here. 
+ } - System.out.println("iterator seek test passed."); + try (final RocksIterator iterator = db.newIterator()) { + + boolean seekToFirstPassed = false; + for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) { + iterator.status(); + assert (iterator.key() != null); + assert (iterator.value() != null); + seekToFirstPassed = true; + } + if (seekToFirstPassed) { + System.out.println("iterator seekToFirst tests passed."); + } + + boolean seekToLastPassed = false; + for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) { + iterator.status(); + assert (iterator.key() != null); + assert (iterator.value() != null); + seekToLastPassed = true; + } + + if (seekToLastPassed) { + System.out.println("iterator seekToLastPassed tests passed."); + } + + iterator.seekToFirst(); + iterator.seek(iterator.key()); + assert (iterator.key() != null); + assert (iterator.value() != null); + + System.out.println("iterator seek test passed."); - iterator.dispose(); - System.out.println("iterator tests passed."); + } + System.out.println("iterator tests passed."); - iterator = db.newIterator(); - List keys = new ArrayList(); - for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) { - keys.add(iterator.key()); - } - iterator.dispose(); + final List keys = new ArrayList<>(); + try (final RocksIterator iterator = db.newIterator()) { + for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) { + keys.add(iterator.key()); + } + } - Map values = db.multiGet(keys); - assert(values.size() == keys.size()); - for(byte[] value1 : values.values()) { - assert(value1 != null); - } + Map values = db.multiGet(keys); + assert (values.size() == keys.size()); + for (byte[] value1 : values.values()) { + assert (value1 != null); + } - values = db.multiGet(new ReadOptions(), keys); - assert(values.size() == keys.size()); - for(byte[] value1 : values.values()) { - assert(value1 != null); + values = db.multiGet(new ReadOptions(), keys); + assert (values.size() == 
keys.size()); + for (byte[] value1 : values.values()) { + assert (value1 != null); + } + } catch (RocksDBException e) { + System.err.println(e); } - } catch (RocksDBException e) { - System.err.println(e); - } - if (db != null) { - db.close(); } - // be sure to dispose c++ pointers - options.dispose(); - readOptions.dispose(); } } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java index 2b78deddb0..7d3c5bcd92 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,22 +8,23 @@ * A CompactionFilter allows an application to modify/delete a key-value at * the time of compaction. * - * At present we just permit an overriding Java class to wrap a C++ implementation + * At present we just permit an overriding Java class to wrap a C++ + * implementation */ public abstract class AbstractCompactionFilter> extends RocksObject { + protected AbstractCompactionFilter(final long nativeHandle) { + super(nativeHandle); + } + /** - * Deletes underlying C++ comparator pointer. + * Deletes underlying C++ compaction pointer. * * Note that this function should be called only after all - * RocksDB instances referencing the comparator are closed. + * RocksDB instances referencing the compaction filter are closed. * Otherwise an undefined behavior will occur. 
*/ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - private native void disposeInternal(long handle); + @Override + protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java index c2412d7f2c..78ee371658 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -15,7 +15,11 @@ * @see org.rocksdb.DirectComparator */ public abstract class AbstractComparator> - extends RocksObject { + extends AbstractImmutableNativeReference { + + protected AbstractComparator() { + super(true); + } /** * The name of the comparator. Used to check for comparator @@ -91,10 +95,12 @@ public String findShortSuccessor(final String key) { * RocksDB instances referencing the comparator are closed. * Otherwise an undefined behavior will occur. 
*/ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); + @Override + protected void disposeInternal() { + disposeInternal(getNativeHandle()); } - private native void disposeInternal(long handle); + protected abstract long getNativeHandle(); + + private native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java new file mode 100644 index 0000000000..b0af31ac37 --- /dev/null +++ b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java @@ -0,0 +1,66 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Offers functionality for implementations of + * {@link AbstractNativeReference} which have an immutable reference to the + * underlying native C++ object + */ +public abstract class AbstractImmutableNativeReference + extends AbstractNativeReference { + + /** + * A flag indicating whether the current {@code AbstractNativeReference} is + * responsible to free the underlying C++ object + */ + private final AtomicBoolean owningHandle_; + + protected AbstractImmutableNativeReference(final boolean owningHandle) { + this.owningHandle_ = new AtomicBoolean(owningHandle); + } + + @Override + public boolean isOwningHandle() { + return owningHandle_.get(); + } + + /** + * Releases this {@code AbstractNativeReference} from the responsibility of + * freeing the underlying native C++ object + *

+ * This will prevent the object from attempting to delete the underlying + * native object in its finalizer. This must be used when another object + * takes over ownership of the native object or both will attempt to delete + * the underlying object when garbage collected. + *

+ * When {@code disOwnNativeHandle()} is called, {@code dispose()} will + * subsequently take no action. As a result, incorrect use of this function + * may cause a memory leak. + *

+ * + * @see #dispose() + */ + protected final void disOwnNativeHandle() { + owningHandle_.set(false); + } + + @Override + public void close() { + if (owningHandle_.compareAndSet(true, false)) { + disposeInternal(); + } + } + + /** + * The helper function of {@link AbstractImmutableNativeReference#dispose()} + * which all subclasses of {@code AbstractImmutableNativeReference} must + * implement to release their underlying native C++ objects. + */ + protected abstract void disposeInternal(); +} diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java new file mode 100644 index 0000000000..c5aae48909 --- /dev/null +++ b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java @@ -0,0 +1,76 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * AbstractNativeReference is the base-class of all RocksDB classes that have + * a pointer to a native C++ {@code rocksdb} object. + *

+ * AbstractNativeReference has the {@link AbstractNativeReference#dispose()} + * method, which frees its associated C++ object.

+ *

+ * This function should be called manually, however, if required it will be + * called automatically during the regular Java GC process via + * {@link AbstractNativeReference#finalize()}.

+ *

+ * Note - Java can only see the long member variable (which is the C++ pointer + * value to the native object), as such it does not know the real size of the + * object and therefore may assign a low GC priority for it; So it is strongly + * suggested that you manually dispose of objects when you are finished with + * them.

+ */ +public abstract class AbstractNativeReference implements AutoCloseable { + + /** + * Returns true if we are responsible for freeing the underlying C++ object + * + * @return true if we are responsible to free the C++ object + * @see #dispose() + */ + protected abstract boolean isOwningHandle(); + + /** + * Frees the underlying C++ object + *

+ * It is strong recommended that the developer calls this after they + * have finished using the object.

+ *

+ * Note, that once an instance of {@link AbstractNativeReference} has been + * disposed, calling any of its functions will lead to undefined + * behavior.

+ */ + @Override + public abstract void close(); + + /** + * @deprecated Instead use {@link AbstractNativeReference#close()} + */ + @Deprecated + public final void dispose() { + close(); + } + + /** + * Simply calls {@link AbstractNativeReference#dispose()} to free + * any underlying C++ object reference which has not yet been manually + * released. + * + * @deprecated You should not rely on GC of Rocks objects, and instead should + * either call {@link AbstractNativeReference#close()} manually or make + * use of some sort of ARM (Automatic Resource Management) such as + * Java 7's try-with-resources + * statement + */ + @Override + @Deprecated + protected void finalize() throws Throwable { + if(isOwningHandle()) { + //TODO(AR) log a warning message... developer should have called close() + } + dispose(); + super.finalize(); + } +} diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java index f3f89a6710..a1547b3b33 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -25,8 +25,7 @@ public abstract class AbstractRocksIterator

protected AbstractRocksIterator(final P parent, final long nativeHandle) { - super(); - nativeHandle_ = nativeHandle; + super(nativeHandle); // parent must point to a valid RocksDB instance. assert (parent != null); // RocksIterator must hold a reference to the related parent instance @@ -37,43 +36,43 @@ protected AbstractRocksIterator(final P parent, @Override public boolean isValid() { - assert (isInitialized()); + assert (isOwningHandle()); return isValid0(nativeHandle_); } @Override public void seekToFirst() { - assert (isInitialized()); + assert (isOwningHandle()); seekToFirst0(nativeHandle_); } @Override public void seekToLast() { - assert (isInitialized()); + assert (isOwningHandle()); seekToLast0(nativeHandle_); } @Override public void seek(byte[] target) { - assert (isInitialized()); + assert (isOwningHandle()); seek0(nativeHandle_, target, target.length); } @Override public void next() { - assert (isInitialized()); + assert (isOwningHandle()); next0(nativeHandle_); } @Override public void prev() { - assert (isInitialized()); + assert (isOwningHandle()); prev0(nativeHandle_); } @Override public void status() throws RocksDBException { - assert (isInitialized()); + assert (isOwningHandle()); status0(nativeHandle_); } @@ -87,15 +86,11 @@ public void status() throws RocksDBException { */ @Override protected void disposeInternal() { - synchronized (parent_) { - assert (isInitialized()); - if (parent_.isInitialized()) { + if (parent_.isOwningHandle()) { disposeInternal(nativeHandle_); } - } } - abstract void disposeInternal(long handle); abstract boolean isValid0(long handle); abstract void seekToFirst0(long handle); abstract void seekToLast0(long handle); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java index a37bd023ef..b6335a5f4e 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java +++ 
b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -24,7 +24,15 @@ * C++ BaseComparatorJniCallback subclass, which in turn destroys the * Java @see org.rocksdb.AbstractSlice subclass Objects. */ -abstract class AbstractSlice extends RocksObject { +public abstract class AbstractSlice extends RocksMutableObject { + + protected AbstractSlice() { + super(); + } + + protected AbstractSlice(final long nativeHandle) { + super(nativeHandle); + } /** * Returns the data of the slice. @@ -34,8 +42,7 @@ abstract class AbstractSlice extends RocksObject { * @see org.rocksdb.AbstractSlice#data0(long) */ public T data() { - assert (isInitialized()); - return data0(nativeHandle_); + return data0(getNativeHandle()); } /** @@ -56,8 +63,7 @@ public T data() { * @return The length in bytes. */ public int size() { - assert (isInitialized()); - return size0(nativeHandle_); + return size0(getNativeHandle()); } /** @@ -67,8 +73,7 @@ public int size() { * @return true if there is no data, false otherwise. */ public boolean empty() { - assert (isInitialized()); - return empty0(nativeHandle_); + return empty0(getNativeHandle()); } /** @@ -80,8 +85,7 @@ public boolean empty() { * @return The string representation of the data. 
*/ public String toString(final boolean hex) { - assert (isInitialized()); - return toString0(nativeHandle_, hex); + return toString0(getNativeHandle(), hex); } @Override @@ -101,8 +105,15 @@ public String toString() { */ public int compare(final AbstractSlice other) { assert (other != null); - assert (isInitialized()); - return compare0(nativeHandle_, other.nativeHandle_); + if(!isOwningHandle()) { + return other.isOwningHandle() ? -1 : 0; + } else { + if(!other.isOwningHandle()) { + return 1; + } else { + return compare0(getNativeHandle(), other.getNativeHandle()); + } + } } @Override @@ -141,13 +152,19 @@ public boolean equals(final Object other) { */ public boolean startsWith(final AbstractSlice prefix) { if (prefix != null) { - assert (isInitialized()); - return startsWith0(nativeHandle_, prefix.nativeHandle_); + return startsWith0(getNativeHandle(), prefix.getNativeHandle()); } else { return false; } } + protected native static long createNewSliceFromString(final String str); + private native int size0(long handle); + private native boolean empty0(long handle); + private native String toString0(long handle, boolean hex); + private native int compare0(long handle, long otherHandle); + private native boolean startsWith0(long handle, long otherHandle); + /** * Deletes underlying C++ slice pointer. * Note that this function should be called only after all @@ -155,17 +172,6 @@ public boolean startsWith(final AbstractSlice prefix) { * Otherwise an undefined behavior will occur. 
*/ @Override - protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - protected native void createNewSliceFromString(String str); - private native int size0(long handle); - private native boolean empty0(long handle); - private native String toString0(long handle, boolean hex); - private native int compare0(long handle, long otherHandle); - private native boolean startsWith0(long handle, long otherHandle); - private native void disposeInternal(long handle); + protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java index b380c5d8a7..cad7ebbd3a 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java @@ -1,92 +1,113 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
package org.rocksdb; -public abstract class AbstractWriteBatch extends RocksObject implements WriteBatchInterface { +public abstract class AbstractWriteBatch extends RocksObject + implements WriteBatchInterface { + + protected AbstractWriteBatch(final long nativeHandle) { + super(nativeHandle); + } @Override public int count() { - assert (isInitialized()); - return count0(); + assert (isOwningHandle()); + return count0(nativeHandle_); } @Override public void put(byte[] key, byte[] value) { - assert (isInitialized()); - put(key, key.length, value, value.length); + assert (isOwningHandle()); + put(nativeHandle_, key, key.length, value, value.length); } @Override - public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { - assert (isInitialized()); - put(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); + public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, + byte[] value) { + assert (isOwningHandle()); + put(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); } @Override public void merge(byte[] key, byte[] value) { - assert (isInitialized()); - merge(key, key.length, value, value.length); + assert (isOwningHandle()); + merge(nativeHandle_, key, key.length, value, value.length); } @Override - public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { - assert (isInitialized()); - merge(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); + public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, + byte[] value) { + assert (isOwningHandle()); + merge(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); } @Override public void remove(byte[] key) { - assert (isInitialized()); - remove(key, key.length); + assert (isOwningHandle()); + remove(nativeHandle_, key, key.length); } @Override public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { - assert 
(isInitialized()); - remove(key, key.length, columnFamilyHandle.nativeHandle_); + assert (isOwningHandle()); + remove(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } @Override public void putLogData(byte[] blob) { - assert (isInitialized()); - putLogData(blob, blob.length); + assert (isOwningHandle()); + putLogData(nativeHandle_, blob, blob.length); } @Override public void clear() { - assert (isInitialized()); - clear0(); + assert (isOwningHandle()); + clear0(nativeHandle_); + } + + @Override + public void setSavePoint() { + assert (isOwningHandle()); + setSavePoint0(nativeHandle_); } - /** - * Delete the c++ side pointer. - */ @Override - protected void disposeInternal() { - assert (isInitialized()); - disposeInternal(nativeHandle_); + public void rollbackToSavePoint() throws RocksDBException { + assert (isOwningHandle()); + rollbackToSavePoint0(nativeHandle_); } - abstract void disposeInternal(long handle); + abstract int count0(final long handle); + + abstract void put(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen); - abstract int count0(); + abstract void put(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen, final long cfHandle); - abstract void put(byte[] key, int keyLen, byte[] value, int valueLen); + abstract void merge(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen); - abstract void put(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle); + abstract void merge(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen, final long cfHandle); - abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen); + abstract void remove(final long handle, final byte[] key, + final int keyLen); - abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle); + abstract void remove(final long handle, final byte[] key, + 
final int keyLen, final long cfHandle); - abstract void remove(byte[] key, int keyLen); + abstract void putLogData(final long handle, final byte[] blob, + final int blobLen); - abstract void remove(byte[] key, int keyLen, long cfHandle); + abstract void clear0(final long handle); - abstract void putLogData(byte[] blob, int blobLen); + abstract void setSavePoint0(final long handle); - abstract void clear0(); + abstract void rollbackToSavePoint0(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java b/external/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java index 2f944e5fb8..22f1d359e5 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -19,8 +19,8 @@ */ public class BackupEngine extends RocksObject implements AutoCloseable { - protected BackupEngine() { - super(); + protected BackupEngine(final long nativeHandle) { + super(nativeHandle); } /** @@ -30,12 +30,11 @@ protected BackupEngine() { * @param options Any options for the backup engine * * @return A new BackupEngine instance + * @throws RocksDBException thrown if the backup engine could not be opened */ public static BackupEngine open(final Env env, final BackupableDBOptions options) throws RocksDBException { - final BackupEngine be = new BackupEngine(); - be.open(env.nativeHandle_, options.nativeHandle_); - return be; + return new BackupEngine(open(env.nativeHandle_, options.nativeHandle_)); } /** @@ -47,6 +46,8 @@ public static BackupEngine open(final Env env, * @param db The database to backup * * Note - This method is not thread safe + * + * @throws RocksDBException thrown if a new backup could not be created */ public void createNewBackup(final RocksDB db) throws RocksDBException { createNewBackup(db, false); @@ -70,11 +71,13 @@ public void createNewBackup(final RocksDB db) throws RocksDBException { * parameter. * * Note - This method is not thread safe + * + * @throws RocksDBException thrown if a new backup could not be created */ public void createNewBackup( final RocksDB db, final boolean flushBeforeBackup) throws RocksDBException { - assert (isInitialized()); + assert (isOwningHandle()); createNewBackup(nativeHandle_, db.nativeHandle_, flushBeforeBackup); } @@ -85,7 +88,7 @@ public void createNewBackup( * @return A list of information about each available backup */ public List getBackupInfo() { - assert (isInitialized()); + assert (isOwningHandle()); return getBackupInfo(nativeHandle_); } @@ -97,7 +100,7 @@ public List getBackupInfo() { * @return array of backup ids as int ids. 
*/ public int[] getCorruptedBackups() { - assert(isInitialized()); + assert(isOwningHandle()); return getCorruptedBackups(nativeHandle_); } @@ -110,7 +113,7 @@ public int[] getCorruptedBackups() { * native library. */ public void garbageCollect() throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); garbageCollect(nativeHandle_); } @@ -118,10 +121,12 @@ public void garbageCollect() throws RocksDBException { * Deletes old backups, keeping just the latest numBackupsToKeep * * @param numBackupsToKeep The latest n backups to keep + * + * @throws RocksDBException thrown if the old backups could not be deleted */ public void purgeOldBackups( final int numBackupsToKeep) throws RocksDBException { - assert (isInitialized()); + assert (isOwningHandle()); purgeOldBackups(nativeHandle_, numBackupsToKeep); } @@ -129,9 +134,11 @@ public void purgeOldBackups( * Deletes a backup * * @param backupId The id of the backup to delete + * + * @throws RocksDBException thrown if the backup could not be deleted */ public void deleteBackup(final int backupId) throws RocksDBException { - assert (isInitialized()); + assert (isOwningHandle()); deleteBackup(nativeHandle_, backupId); } @@ -154,11 +161,13 @@ public void deleteBackup(final int backupId) throws RocksDBException { * @param walDir The location of the log files for your database, * often the same as dbDir * @param restoreOptions Options for controlling the restore + * + * @throws RocksDBException thrown if the database could not be restored */ public void restoreDbFromBackup( final int backupId, final String dbDir, final String walDir, final RestoreOptions restoreOptions) throws RocksDBException { - assert (isInitialized()); + assert (isOwningHandle()); restoreDbFromBackup(nativeHandle_, backupId, dbDir, walDir, restoreOptions.nativeHandle_); } @@ -166,34 +175,24 @@ public void restoreDbFromBackup( /** * Restore the database from the latest backup * - * @param dbDir The directory to restore the backup to, 
i.e. where your database is - * @param walDir The location of the log files for your database, often the same as dbDir + * @param dbDir The directory to restore the backup to, i.e. where your + * database is + * @param walDir The location of the log files for your database, often the + * same as dbDir * @param restoreOptions Options for controlling the restore + * + * @throws RocksDBException thrown if the database could not be restored */ public void restoreDbFromLatestBackup( final String dbDir, final String walDir, final RestoreOptions restoreOptions) throws RocksDBException { - assert (isInitialized()); + assert (isOwningHandle()); restoreDbFromLatestBackup(nativeHandle_, dbDir, walDir, restoreOptions.nativeHandle_); } - /** - * Close the Backup Engine - */ - @Override - public void close() throws RocksDBException { - dispose(); - } - - @Override - protected void disposeInternal() { - assert (isInitialized()); - disposeInternal(nativeHandle_); - } - - private native void open(final long env, final long backupableDbOptions) - throws RocksDBException; + private native static long open(final long env, + final long backupableDbOptions) throws RocksDBException; private native void createNewBackup(final long handle, final long dbHandle, final boolean flushBeforeBackup) throws RocksDBException; @@ -218,5 +217,5 @@ private native void restoreDbFromLatestBackup(final long handle, final String dbDir, final String walDir, final long restoreOptionsHandle) throws RocksDBException; - private native void disposeInternal(final long handle); + @Override protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java b/external/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java index 48a52a789f..4f3a628458 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, 
Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java b/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java index f2646d22a8..cebd69f674 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,8 +21,8 @@ public class BackupableDB extends RocksDB { * * @param opt {@link org.rocksdb.Options} to set for the database. * @param bopt {@link org.rocksdb.BackupableDBOptions} to use. - * @param db_path Path to store data to. The path for storing the backup should be - * specified in the {@link org.rocksdb.BackupableDBOptions}. + * @param db_path Path to store data to. The path for storing the backup + * should be specified in the {@link org.rocksdb.BackupableDBOptions}. * * @return {@link BackupableDB} reference to the opened database. 
* @@ -33,9 +33,9 @@ public static BackupableDB open( final Options opt, final BackupableDBOptions bopt, final String db_path) throws RocksDBException { - RocksDB db = RocksDB.open(opt, db_path); - BackupableDB bdb = new BackupableDB(); - bdb.open(db.nativeHandle_, bopt.nativeHandle_); + final RocksDB db = RocksDB.open(opt, db_path); + final BackupableDB bdb = new BackupableDB(open(db.nativeHandle_, + bopt.nativeHandle_)); // Prevent the RocksDB object from attempting to delete // the underly C++ DB object. @@ -56,7 +56,7 @@ public static BackupableDB open( */ public void createNewBackup(final boolean flushBeforeBackup) throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); createNewBackup(nativeHandle_, flushBeforeBackup); } @@ -70,7 +70,7 @@ public void createNewBackup(final boolean flushBeforeBackup) */ public void purgeOldBackups(final int numBackupsToKeep) throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); purgeOldBackups(nativeHandle_, numBackupsToKeep); } @@ -83,7 +83,7 @@ public void purgeOldBackups(final int numBackupsToKeep) * native library. */ public void deleteBackup(final int backupId) throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); deleteBackup0(nativeHandle_, backupId); } @@ -94,7 +94,7 @@ public void deleteBackup(final int backupId) throws RocksDBException { * @return List of {@link BackupInfo} instances. */ public List getBackupInfos() { - assert(isInitialized()); + assert(isOwningHandle()); return getBackupInfo(nativeHandle_); } @@ -106,7 +106,7 @@ public List getBackupInfos() { * @return array of backup ids as int ids. */ public int[] getCorruptedBackups() { - assert(isInitialized()); + assert(isOwningHandle()); return getCorruptedBackups(nativeHandle_); } @@ -119,7 +119,7 @@ public int[] getCorruptedBackups() { * native library. 
*/ public void garbageCollect() throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); garbageCollect(nativeHandle_); } @@ -132,19 +132,19 @@ public void garbageCollect() throws RocksDBException { * of the c++ {@code rocksdb::BackupableDB} and should be transparent * to Java developers.

*/ - @Override public synchronized void close() { - if (isInitialized()) { + @Override public void close() { super.close(); - } } /** *

A protected construction that will be used in the static * factory method {@link #open(Options, BackupableDBOptions, String)}. *

+ * + * @param nativeHandle The native handle of the C++ BackupableDB object */ - protected BackupableDB() { - super(); + protected BackupableDB(final long nativeHandle) { + super(nativeHandle); } @Override protected void finalize() throws Throwable { @@ -152,7 +152,8 @@ protected BackupableDB() { super.finalize(); } - protected native void open(long rocksDBHandle, long backupDBOptionsHandle); + protected native static long open(final long rocksDBHandle, + final long backupDBOptionsHandle); protected native void createNewBackup(long handle, boolean flag) throws RocksDBException; protected native void purgeOldBackups(long handle, int numBackupsToKeep) diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java index 17a0afc289..89591de82d 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,7 +6,6 @@ package org.rocksdb; import java.io.File; -import java.nio.file.Path; /** *

BackupableDBOptions to control the behavior of a backupable database. @@ -22,17 +21,22 @@ public class BackupableDBOptions extends RocksObject { /** *

BackupableDBOptions constructor.

* - * @param path Where to keep the backup files. Has to be different than db name. - * Best to set this to {@code db name_ + "/backups"} + * @param path Where to keep the backup files. Has to be different than db + * name. Best to set this to {@code db name_ + "/backups"} * @throws java.lang.IllegalArgumentException if illegal path is used. */ public BackupableDBOptions(final String path) { - super(); - File backupPath = path == null ? null : new File(path); - if (backupPath == null || !backupPath.isDirectory() || !backupPath.canWrite()) { + super(newBackupableDBOptions(ensureWritableFile(path))); + } + + private static String ensureWritableFile(final String path) { + final File backupPath = path == null ? null : new File(path); + if (backupPath == null || !backupPath.isDirectory() || + !backupPath.canWrite()) { throw new IllegalArgumentException("Illegal path provided."); + } else { + return path; } - newBackupableDBOptions(path); } /** @@ -41,24 +45,25 @@ public BackupableDBOptions(final String path) { * @return the path to the BackupableDB directory. */ public String backupDir() { - assert(isInitialized()); + assert(isOwningHandle()); return backupDir(nativeHandle_); } /** *

Share table files between backups.

* - * @param shareTableFiles If {@code share_table_files == true}, backup will assume - * that table files with same name have the same contents. This enables incremental - * backups and avoids unnecessary data copies. If {@code share_table_files == false}, - * each backup will be on its own and will not share any data with other backups. + * @param shareTableFiles If {@code share_table_files == true}, backup will + * assume that table files with same name have the same contents. This + * enables incremental backups and avoids unnecessary data copies. If + * {@code share_table_files == false}, each backup will be on its own and + * will not share any data with other backups. * *

Default: true

* * @return instance of current BackupableDBOptions. */ public BackupableDBOptions setShareTableFiles(final boolean shareTableFiles) { - assert(isInitialized()); + assert(isOwningHandle()); setShareTableFiles(nativeHandle_, shareTableFiles); return this; } @@ -70,24 +75,24 @@ public BackupableDBOptions setShareTableFiles(final boolean shareTableFiles) { * backups. */ public boolean shareTableFiles() { - assert(isInitialized()); + assert(isOwningHandle()); return shareTableFiles(nativeHandle_); } /** *

Set synchronous backups.

* - * @param sync If {@code sync == true}, we can guarantee you'll get consistent backup - * even on a machine crash/reboot. Backup process is slower with sync enabled. - * If {@code sync == false}, we don't guarantee anything on machine reboot. - * However,chances are some of the backups are consistent. + * @param sync If {@code sync == true}, we can guarantee you'll get consistent + * backup even on a machine crash/reboot. Backup process is slower with sync + * enabled. If {@code sync == false}, we don't guarantee anything on machine + * reboot. However, chances are some of the backups are consistent. * *

Default: true

* * @return instance of current BackupableDBOptions. */ public BackupableDBOptions setSync(final boolean sync) { - assert(isInitialized()); + assert(isOwningHandle()); setSync(nativeHandle_, sync); return this; } @@ -98,21 +103,22 @@ public BackupableDBOptions setSync(final boolean sync) { * @return boolean value if synchronous backups are configured. */ public boolean sync() { - assert(isInitialized()); + assert(isOwningHandle()); return sync(nativeHandle_); } /** *

Set if old data will be destroyed.

* - * @param destroyOldData If true, it will delete whatever backups there are already. + * @param destroyOldData If true, it will delete whatever backups there are + * already. * *

Default: false

* * @return instance of current BackupableDBOptions. */ public BackupableDBOptions setDestroyOldData(final boolean destroyOldData) { - assert(isInitialized()); + assert(isOwningHandle()); setDestroyOldData(nativeHandle_, destroyOldData); return this; } @@ -123,23 +129,23 @@ public BackupableDBOptions setDestroyOldData(final boolean destroyOldData) { * @return boolean value indicating if old data will be destroyed. */ public boolean destroyOldData() { - assert(isInitialized()); + assert(isOwningHandle()); return destroyOldData(nativeHandle_); } /** *

Set if log files shall be persisted.

* - * @param backupLogFiles If false, we won't backup log files. This option can be - * useful for backing up in-memory databases where log file are persisted,but table - * files are in memory. + * @param backupLogFiles If false, we won't backup log files. This option can + * be useful for backing up in-memory databases where log file are + * persisted, but table files are in memory. * *

Default: true

* * @return instance of current BackupableDBOptions. */ public BackupableDBOptions setBackupLogFiles(final boolean backupLogFiles) { - assert(isInitialized()); + assert(isOwningHandle()); setBackupLogFiles(nativeHandle_, backupLogFiles); return this; } @@ -150,73 +156,76 @@ public BackupableDBOptions setBackupLogFiles(final boolean backupLogFiles) { * @return boolean value indicating if log files will be persisted. */ public boolean backupLogFiles() { - assert(isInitialized()); + assert(isOwningHandle()); return backupLogFiles(nativeHandle_); } /** *

Set backup rate limit.

* - * @param backupRateLimit Max bytes that can be transferred in a second during backup. - * If 0 or negative, then go as fast as you can. + * @param backupRateLimit Max bytes that can be transferred in a second during + * backup. If 0 or negative, then go as fast as you can. * *

Default: 0

* * @return instance of current BackupableDBOptions. */ public BackupableDBOptions setBackupRateLimit(long backupRateLimit) { - assert(isInitialized()); + assert(isOwningHandle()); backupRateLimit = (backupRateLimit <= 0) ? 0 : backupRateLimit; setBackupRateLimit(nativeHandle_, backupRateLimit); return this; } /** - *

Return backup rate limit which described the max bytes that can be transferred in a - * second during backup.

+ *

Return backup rate limit which described the max bytes that can be + * transferred in a second during backup.

* - * @return numerical value describing the backup transfer limit in bytes per second. + * @return numerical value describing the backup transfer limit in bytes per + * second. */ public long backupRateLimit() { - assert(isInitialized()); + assert(isOwningHandle()); return backupRateLimit(nativeHandle_); } /** *

Set restore rate limit.

* - * @param restoreRateLimit Max bytes that can be transferred in a second during restore. - * If 0 or negative, then go as fast as you can. + * @param restoreRateLimit Max bytes that can be transferred in a second + * during restore. If 0 or negative, then go as fast as you can. * *

Default: 0

* * @return instance of current BackupableDBOptions. */ public BackupableDBOptions setRestoreRateLimit(long restoreRateLimit) { - assert(isInitialized()); + assert(isOwningHandle()); restoreRateLimit = (restoreRateLimit <= 0) ? 0 : restoreRateLimit; setRestoreRateLimit(nativeHandle_, restoreRateLimit); return this; } /** - *

Return restore rate limit which described the max bytes that can be transferred in a - * second during restore.

+ *

Return restore rate limit which described the max bytes that can be + * transferred in a second during restore.

* - * @return numerical value describing the restore transfer limit in bytes per second. + * @return numerical value describing the restore transfer limit in bytes per + * second. */ public long restoreRateLimit() { - assert(isInitialized()); + assert(isOwningHandle()); return restoreRateLimit(nativeHandle_); } /** - *

Only used if share_table_files is set to true. If true, will consider that - * backups can come from different databases, hence a sst is not uniquely - * identified by its name, but by the triple (file name, crc32, file length)

+ *

Only used if share_table_files is set to true. If true, will consider + * that backups can come from different databases, hence a sst is not uniquely + * identified by its name, but by the triple (file name, crc32, file length) + *

* - * @param shareFilesWithChecksum boolean value indicating if SST files are stored - * using the triple (file name, crc32, file length) and not its name. + * @param shareFilesWithChecksum boolean value indicating if SST files are + * stored using the triple (file name, crc32, file length) and not its name. * *

Note: this is an experimental option, and you'll need to set it manually * turn it on only if you know what you're doing*

@@ -227,7 +236,7 @@ public long restoreRateLimit() { */ public BackupableDBOptions setShareFilesWithChecksum( final boolean shareFilesWithChecksum) { - assert(isInitialized()); + assert(isOwningHandle()); setShareFilesWithChecksum(nativeHandle_, shareFilesWithChecksum); return this; } @@ -239,19 +248,11 @@ public BackupableDBOptions setShareFilesWithChecksum( * is active. */ public boolean shareFilesWithChecksum() { - assert(isInitialized()); + assert(isOwningHandle()); return shareFilesWithChecksum(nativeHandle_); } - /** - * Release the memory allocated for the current instance - * in the c++ side. - */ - @Override protected void disposeInternal() { - disposeInternal(nativeHandle_); - } - - private native void newBackupableDBOptions(String path); + private native static long newBackupableDBOptions(final String path); private native String backupDir(long handle); private native void setShareTableFiles(long handle, boolean flag); private native boolean shareTableFiles(long handle); @@ -267,5 +268,5 @@ public boolean shareFilesWithChecksum() { private native long restoreRateLimit(long handle); private native void setShareFilesWithChecksum(long handle, boolean flag); private native boolean shareFilesWithChecksum(long handle); - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/external/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index c3c6309b3d..050eff1c89 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,6 +21,7 @@ public BlockBasedTableConfig() { wholeKeyFiltering_ = true; filter_ = null; cacheIndexAndFilterBlocks_ = false; + pinL0FilterAndIndexBlocksInCache_ = false; hashIndexAllowCollision_ = true; blockCacheCompressedSize_ = 0; blockCacheCompressedNumShardBits_ = 0; @@ -226,6 +227,29 @@ public BlockBasedTableConfig setCacheIndexAndFilterBlocks( return this; } + /** + * Indicating if we'd like to pin L0 index/filter blocks to the block cache. + If not specified, defaults to false. + * + * @return if L0 index and filter blocks should be pinned to the block cache. + */ + public boolean pinL0FilterAndIndexBlocksInCache() { + return pinL0FilterAndIndexBlocksInCache_; + } + + /** + * Indicating if we'd like to pin L0 index/filter blocks to the block cache. + If not specified, defaults to false. + * + * @param pinL0FilterAndIndexBlocksInCache pin blocks in block cache + * @return the reference to the current config. + */ + public BlockBasedTableConfig setPinL0FilterAndIndexBlocksInCache( + final boolean pinL0FilterAndIndexBlocksInCache) { + pinL0FilterAndIndexBlocksInCache_ = pinL0FilterAndIndexBlocksInCache; + return this; + } + /** * Influence the behavior when kHashSearch is used. 
if false, stores a precise prefix to block range mapping @@ -393,6 +417,7 @@ public int formatVersion() { blockCacheNumShardBits_, blockSize_, blockSizeDeviation_, blockRestartInterval_, wholeKeyFiltering_, filterHandle, cacheIndexAndFilterBlocks_, + pinL0FilterAndIndexBlocksInCache_, hashIndexAllowCollision_, blockCacheCompressedSize_, blockCacheCompressedNumShardBits_, checksumType_.getValue(), indexType_.getValue(), @@ -403,11 +428,13 @@ private native long newTableFactoryHandle( boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, long blockSize, int blockSizeDeviation, int blockRestartInterval, boolean wholeKeyFiltering, long filterPolicyHandle, - boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision, - long blockCacheCompressedSize, int blockCacheCompressedNumShardBits, - byte checkSumType, byte indexType, int formatVersion); + boolean cacheIndexAndFilterBlocks, boolean pinL0FilterAndIndexBlocksInCache, + boolean hashIndexAllowCollision, long blockCacheCompressedSize, + int blockCacheCompressedNumShardBits, byte checkSumType, + byte indexType, int formatVersion); private boolean cacheIndexAndFilterBlocks_; + private boolean pinL0FilterAndIndexBlocksInCache_; private IndexType indexType_; private boolean hashIndexAllowCollision_; private ChecksumType checksumType_; diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java b/external/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java index 67c45d7171..a8c2f7e7f9 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -22,8 +22,6 @@ public class BloomFilter extends Filter { private static final int DEFAULT_BITS_PER_KEY = 10; private static final boolean DEFAULT_MODE = true; - private final int bitsPerKey_; - private final boolean useBlockBasedMode_; /** * BloomFilter constructor @@ -73,17 +71,9 @@ public BloomFilter(final int bitsPerKey) { * @param useBlockBasedMode use block based mode or full filter mode */ public BloomFilter(final int bitsPerKey, final boolean useBlockBasedMode) { - super(); - bitsPerKey_ = bitsPerKey; - useBlockBasedMode_ = useBlockBasedMode; - createNewFilter(); + super(createNewBloomFilter(bitsPerKey, useBlockBasedMode)); } - @Override - protected final void createNewFilter() { - createNewBloomFilter(bitsPerKey_, useBlockBasedMode_); - } - - private native void createNewBloomFilter(int bitsKeyKey, - boolean useBlockBasedMode); + private native static long createNewBloomFilter(final int bitsKeyKey, + final boolean useBlockBasedMode); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java b/external/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java index ee92e8dd93..436cb513f1 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java b/external/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java index 816eceacfd..d867227784 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -27,7 +27,7 @@ public static Checkpoint create(final RocksDB db) { if (db == null) { throw new IllegalArgumentException( "RocksDB instance shall not be null."); - } else if (!db.isInitialized()) { + } else if (!db.isOwningHandle()) { throw new IllegalStateException( "RocksDB instance must be initialized."); } @@ -51,21 +51,15 @@ public void createCheckpoint(final String checkpointPath) createCheckpoint(nativeHandle_, checkpointPath); } - @Override - protected void disposeInternal() { - disposeInternal(nativeHandle_); + private Checkpoint(final RocksDB db) { + super(newCheckpoint(db.nativeHandle_)); + this.db_ = db; } - private Checkpoint(RocksDB db) { - super(); - nativeHandle_ = newCheckpoint(db.nativeHandle_); - db_ = db; - } - - private RocksDB db_; + private final RocksDB db_; private static native long newCheckpoint(long dbHandle); - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); private native void createCheckpoint(long handle, String checkpointPath) throws RocksDBException; diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java b/external/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java index e685376bfd..7f560170c0 100644 --- 
a/external/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java index 8def05e743..84581f465c 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 613cb892c6..6aa22d3fea 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -12,34 +12,31 @@ public class ColumnFamilyHandle extends RocksObject { ColumnFamilyHandle(final RocksDB rocksDB, final long nativeHandle) { - super(); - nativeHandle_ = nativeHandle; + super(nativeHandle); // rocksDB must point to a valid RocksDB instance; assert(rocksDB != null); // ColumnFamilyHandle must hold a reference to the related RocksDB instance // to guarantee that while a GC cycle starts ColumnFamilyHandle instances // are freed prior to RocksDB instances. - rocksDB_ = rocksDB; + this.rocksDB_ = rocksDB; } /** *

Deletes underlying C++ iterator pointer.

* *

Note: the underlying handle can only be safely deleted if the RocksDB - * instance related to a certain ColumnFamilyHandle is still valid and initialized. - * Therefore {@code disposeInternal()} checks if the RocksDB is initialized - * before freeing the native handle.

+ * instance related to a certain ColumnFamilyHandle is still valid and + * initialized. Therefore {@code disposeInternal()} checks if the RocksDB is + * initialized before freeing the native handle.

*/ - @Override protected void disposeInternal() { - synchronized (rocksDB_) { - assert (isInitialized()); - if (rocksDB_.isInitialized()) { - disposeInternal(nativeHandle_); - } + @Override + protected void disposeInternal() { + if(rocksDB_.isOwningHandle()) { + disposeInternal(nativeHandle_); } } - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); private final RocksDB rocksDB_; } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index 4304f589a8..528e0f2053 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,8 +13,8 @@ * ColumnFamilyOptions to control the behavior of a database. It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). * - * If {@link #dispose()} function is not called, then it will be GC'd automatically - * and native resources will be released as part of the process. + * If {@link #dispose()} function is not called, then it will be GC'd + * automatically and native resources will be released as part of the process. */ public class ColumnFamilyOptions extends RocksObject implements ColumnFamilyOptionsInterface { @@ -29,8 +29,7 @@ public class ColumnFamilyOptions extends RocksObject * an {@code rocksdb::DBOptions} in the c++ side. 
*/ public ColumnFamilyOptions() { - super(); - newColumnFamilyOptions(); + super(newColumnFamilyOptions()); } /** @@ -113,8 +112,9 @@ public ColumnFamilyOptions optimizeUniversalStyleCompaction( } @Override - public ColumnFamilyOptions setComparator(final BuiltinComparator builtinComparator) { - assert(isInitialized()); + public ColumnFamilyOptions setComparator( + final BuiltinComparator builtinComparator) { + assert(isOwningHandle()); setComparatorHandle(nativeHandle_, builtinComparator.ordinal()); return this; } @@ -122,15 +122,15 @@ public ColumnFamilyOptions setComparator(final BuiltinComparator builtinComparat @Override public ColumnFamilyOptions setComparator( final AbstractComparator> comparator) { - assert (isInitialized()); - setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + assert (isOwningHandle()); + setComparatorHandle(nativeHandle_, comparator.getNativeHandle()); comparator_ = comparator; return this; } @Override public ColumnFamilyOptions setMergeOperatorName(final String name) { - assert (isInitialized()); + assert (isOwningHandle()); if (name == null) { throw new IllegalArgumentException( "Merge operator name must not be null."); @@ -140,13 +140,15 @@ public ColumnFamilyOptions setMergeOperatorName(final String name) { } @Override - public ColumnFamilyOptions setMergeOperator(final MergeOperator mergeOperator) { + public ColumnFamilyOptions setMergeOperator( + final MergeOperator mergeOperator) { setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle()); return this; } public ColumnFamilyOptions setCompactionFilter( - final AbstractCompactionFilter> compactionFilter) { + final AbstractCompactionFilter> + compactionFilter) { setCompactionFilterHandle(nativeHandle_, compactionFilter.nativeHandle_); compactionFilter_ = compactionFilter; return this; @@ -154,28 +156,28 @@ public ColumnFamilyOptions setCompactionFilter( @Override public ColumnFamilyOptions setWriteBufferSize(final long writeBufferSize) { - 
assert(isInitialized()); + assert(isOwningHandle()); setWriteBufferSize(nativeHandle_, writeBufferSize); return this; } @Override public long writeBufferSize() { - assert(isInitialized()); + assert(isOwningHandle()); return writeBufferSize(nativeHandle_); } @Override public ColumnFamilyOptions setMaxWriteBufferNumber( final int maxWriteBufferNumber) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); return this; } @Override public int maxWriteBufferNumber() { - assert(isInitialized()); + assert(isOwningHandle()); return maxWriteBufferNumber(nativeHandle_); } @@ -193,20 +195,21 @@ public int minWriteBufferNumberToMerge() { @Override public ColumnFamilyOptions useFixedLengthPrefixExtractor(final int n) { - assert(isInitialized()); + assert(isOwningHandle()); useFixedLengthPrefixExtractor(nativeHandle_, n); return this; } @Override public ColumnFamilyOptions useCappedPrefixExtractor(final int n) { - assert(isInitialized()); + assert(isOwningHandle()); useCappedPrefixExtractor(nativeHandle_, n); return this; } @Override - public ColumnFamilyOptions setCompressionType(final CompressionType compressionType) { + public ColumnFamilyOptions setCompressionType( + final CompressionType compressionType) { setCompressionType(nativeHandle_, compressionType.getValue()); return this; } @@ -219,10 +222,10 @@ public CompressionType compressionType() { @Override public ColumnFamilyOptions setCompressionPerLevel( final List compressionLevels) { - final List byteCompressionTypes = new ArrayList<>( - compressionLevels.size()); - for (final CompressionType compressionLevel : compressionLevels) { - byteCompressionTypes.add(compressionLevel.getValue()); + final byte[] byteCompressionTypes = new byte[ + compressionLevels.size()]; + for (int i = 0; i < compressionLevels.size(); i++) { + byteCompressionTypes[i] = compressionLevels.get(i).getValue(); } setCompressionPerLevel(nativeHandle_, byteCompressionTypes); return this; 
@@ -230,7 +233,7 @@ public ColumnFamilyOptions setCompressionPerLevel( @Override public List compressionPerLevel() { - final List byteCompressionTypes = + final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); for (final Byte byteCompressionType : byteCompressionTypes) { @@ -485,7 +488,7 @@ public CompactionStyle compactionStyle() { public ColumnFamilyOptions setMaxTableFilesSizeFIFO( final long maxTableFilesSize) { assert(maxTableFilesSize > 0); // unsigned native type - assert(isInitialized()); + assert(isOwningHandle()); setMaxTableFilesSizeFIFO(nativeHandle_, maxTableFilesSize); return this; } @@ -508,22 +511,11 @@ public boolean verifyChecksumsInCompaction() { return verifyChecksumsInCompaction(nativeHandle_); } - @Override - public ColumnFamilyOptions setFilterDeletes( - final boolean filterDeletes) { - setFilterDeletes(nativeHandle_, filterDeletes); - return this; - } - - @Override - public boolean filterDeletes() { - return filterDeletes(nativeHandle_); - } - @Override public ColumnFamilyOptions setMaxSequentialSkipInIterations( final long maxSequentialSkipInIterations) { - setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); + setMaxSequentialSkipInIterations(nativeHandle_, + maxSequentialSkipInIterations); return this; } @@ -542,7 +534,7 @@ public ColumnFamilyOptions setMemTableConfig( @Override public String memTableFactoryName() { - assert(isInitialized()); + assert(isOwningHandle()); return memTableFactoryName(nativeHandle_); } @@ -556,7 +548,7 @@ public ColumnFamilyOptions setTableFormatConfig( @Override public String tableFactoryName() { - assert(isInitialized()); + assert(isOwningHandle()); return tableFactoryName(nativeHandle_); } @@ -585,27 +577,15 @@ public long inplaceUpdateNumLocks() { } @Override - public ColumnFamilyOptions setMemtablePrefixBloomBits( - final int memtablePrefixBloomBits) { - setMemtablePrefixBloomBits(nativeHandle_, 
memtablePrefixBloomBits); + public ColumnFamilyOptions setMemtablePrefixBloomSizeRatio( + final double memtablePrefixBloomSizeRatio) { + setMemtablePrefixBloomSizeRatio(nativeHandle_, memtablePrefixBloomSizeRatio); return this; } @Override - public int memtablePrefixBloomBits() { - return memtablePrefixBloomBits(nativeHandle_); - } - - @Override - public ColumnFamilyOptions setMemtablePrefixBloomProbes( - final int memtablePrefixBloomProbes) { - setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); - return this; - } - - @Override - public int memtablePrefixBloomProbes() { - return memtablePrefixBloomProbes(nativeHandle_); + public double memtablePrefixBloomSizeRatio() { + return memtablePrefixBloomSizeRatio(nativeHandle_); } @Override @@ -655,15 +635,6 @@ public boolean optimizeFiltersForHits() { return optimizeFiltersForHits(nativeHandle_); } - /** - * Release the memory allocated for the current instance - * in the c++ side. - */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - /** *

Private constructor to be used by * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}

@@ -671,15 +642,14 @@ public boolean optimizeFiltersForHits() { * @param handle native handle to ColumnFamilyOptions instance. */ private ColumnFamilyOptions(final long handle) { - super(); - nativeHandle_ = handle; + super(handle); } private static native long getColumnFamilyOptionsFromProps( String optString); - private native void newColumnFamilyOptions(); - private native void disposeInternal(long handle); + private static native long newColumnFamilyOptions(); + @Override protected final native void disposeInternal(final long handle); private native void optimizeForPointLookup(long handle, long blockCacheSizeMb); @@ -688,12 +658,12 @@ private native void optimizeLevelStyleCompaction(long handle, private native void optimizeUniversalStyleCompaction(long handle, long memtableMemoryBudget); private native void setComparatorHandle(long handle, int builtinComparator); - private native void setComparatorHandle(long optHandle, long comparatorHandle); - private native void setMergeOperatorName( - long handle, String name); - private native void setMergeOperator( - long handle, long mergeOperatorHandle); - private native void setCompactionFilterHandle(long handle, long compactionFilterHandle); + private native void setComparatorHandle(long optHandle, + long comparatorHandle); + private native void setMergeOperatorName(long handle, String name); + private native void setMergeOperator(long handle, long mergeOperatorHandle); + private native void setCompactionFilterHandle(long handle, + long compactionFilterHandle); private native void setWriteBufferSize(long handle, long writeBufferSize) throws IllegalArgumentException; private native long writeBufferSize(long handle); @@ -706,8 +676,8 @@ private native void setMinWriteBufferNumberToMerge( private native void setCompressionType(long handle, byte compressionType); private native byte compressionType(long handle); private native void setCompressionPerLevel(long handle, - List compressionLevels); - private native List 
compressionPerLevel(long handle); + byte[] compressionLevels); + private native byte[] compressionPerLevel(long handle); private native void useFixedLengthPrefixExtractor( long handle, int prefixLength); private native void useCappedPrefixExtractor( @@ -776,9 +746,6 @@ private native void setPurgeRedundantKvsWhileFlush( private native void setVerifyChecksumsInCompaction( long handle, boolean verifyChecksumsInCompaction); private native boolean verifyChecksumsInCompaction(long handle); - private native void setFilterDeletes( - long handle, boolean filterDeletes); - private native boolean filterDeletes(long handle); private native void setMaxSequentialSkipInIterations( long handle, long maxSequentialSkipInIterations); private native long maxSequentialSkipInIterations(long handle); @@ -793,12 +760,9 @@ private native void setInplaceUpdateNumLocks( long handle, long inplaceUpdateNumLocks) throws IllegalArgumentException; private native long inplaceUpdateNumLocks(long handle); - private native void setMemtablePrefixBloomBits( - long handle, int memtablePrefixBloomBits); - private native int memtablePrefixBloomBits(long handle); - private native void setMemtablePrefixBloomProbes( - long handle, int memtablePrefixBloomProbes); - private native int memtablePrefixBloomProbes(long handle); + private native void setMemtablePrefixBloomSizeRatio( + long handle, double memtablePrefixBloomSizeRatio); + private native double memtablePrefixBloomSizeRatio(long handle); private native void setBloomLocality( long handle, int bloomLocality); private native int bloomLocality(long handle); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java index 1c7a5a110a..dea3d1b9f9 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -1,4 +1,4 
@@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -231,7 +231,7 @@ Object setMinWriteBufferNumberToMerge( /** - * Same as fixed length prefix extractor, except that when slice is + * Same as fixed length prefix extractor, except that when slice is * shorter than the fixed length, it will use the full key. * * @param n use the first n bytes of a key as its prefix. @@ -753,7 +753,7 @@ Object setRateLimitDelayMaxMilliseconds( * If ≤ 0, a proper value is automatically calculated (usually 1/10 of * writer_buffer_size). * - * There are two additonal restriction of the The specified size: + * There are two additional restriction of the The specified size: * (1) size should be in the range of [4096, 2 << 30] and * (2) be the multiple of the CPU word (which helps with the memory * alignment). @@ -774,7 +774,7 @@ Object setRateLimitDelayMaxMilliseconds( * If ≤ 0, a proper value is automatically calculated (usually 1/10 of * writer_buffer_size). * - * There are two additonal restriction of the The specified size: + * There are two additional restriction of the The specified size: * (1) size should be in the range of [4096, 2 << 30] and * (2) be the multiple of the CPU word (which helps with the memory * alignment). @@ -881,29 +881,6 @@ Object setVerifyChecksumsInCompaction( */ boolean verifyChecksumsInCompaction(); - /** - * Use KeyMayExist API to filter deletes when this is true. - * If KeyMayExist returns false, i.e. the key definitely does not exist, then - * the delete is a noop. KeyMayExist only incurs in-memory look up. - * This optimization avoids writing the delete to storage when appropriate. 
- * Default: false - * - * @param filterDeletes true if filter-deletes behavior is on. - * @return the reference to the current option. - */ - Object setFilterDeletes(boolean filterDeletes); - - /** - * Use KeyMayExist API to filter deletes when this is true. - * If KeyMayExist returns false, i.e. the key definitely does not exist, then - * the delete is a noop. KeyMayExist only incurs in-memory look up. - * This optimization avoids writing the delete to storage when appropriate. - * Default: false - * - * @return true if filter-deletes behavior is on. - */ - boolean filterDeletes(); - /** * An iteration->Next() sequentially skips over keys with the same * user-key unless this option is set. This number specifies the number @@ -1011,15 +988,15 @@ Object setVerifyChecksumsInCompaction( long inplaceUpdateNumLocks(); /** - * Sets the number of bits used in the prefix bloom filter. + * Sets the size ratio of the memtable used in the prefix bloom filter. * * This value will be used only when a prefix-extractor is specified. * - * @param memtablePrefixBloomBits the number of bits used in the + * @param memtablePrefixBloomSizeRatio the number of bits used in the * prefix bloom filter. * @return the reference to the current option. */ - Object setMemtablePrefixBloomBits(int memtablePrefixBloomBits); + Object setMemtablePrefixBloomSizeRatio(double memtablePrefixBloomSizeRatio); /** * Returns the number of bits used in the prefix bloom filter. @@ -1029,22 +1006,7 @@ Object setVerifyChecksumsInCompaction( * @return the number of bloom-bits. * @see #useFixedLengthPrefixExtractor(int) */ - int memtablePrefixBloomBits(); - - /** - * The number of hash probes per key used in the mem-table. - * - * @param memtablePrefixBloomProbes the number of hash probes per key. - * @return the reference to the current option. - */ - Object setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes); - - /** - * The number of hash probes per key used in the mem-table. 
- * - * @return the number of hash probes per key. - */ - int memtablePrefixBloomProbes(); + double memtablePrefixBloomSizeRatio(); /** * Control locality of bloom filter probes to improve cache miss rate. @@ -1074,7 +1036,7 @@ Object setVerifyChecksumsInCompaction( * Default: 0 * * @return the level of locality of bloom-filter probes. - * @see #setMemtablePrefixBloomProbes(int) + * @see #setBloomLocality(int) */ int bloomLocality(); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java b/external/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java index 76064395cb..22dc7dcf5f 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Comparator.java b/external/rocksdb/java/src/main/java/org/rocksdb/Comparator.java index c8e050bca8..009f2e51f4 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Comparator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Comparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -15,10 +15,18 @@ * using @see org.rocksdb.DirectComparator */ public abstract class Comparator extends AbstractComparator { + + private final long nativeHandle_; + public Comparator(final ComparatorOptions copt) { super(); - createNewComparator0(copt.nativeHandle_); + this.nativeHandle_ = createNewComparator0(copt.nativeHandle_); + } + + @Override + protected final long getNativeHandle() { + return nativeHandle_; } - private native void createNewComparator0(final long comparatorOptionsHandle); + private native long createNewComparator0(final long comparatorOptionsHandle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java index f0ba520a3c..3a05befa44 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java @@ -10,8 +10,7 @@ */ public class ComparatorOptions extends RocksObject { public ComparatorOptions() { - super(); - newComparatorOptions(); + super(newComparatorOptions()); } /** @@ -24,7 +23,7 @@ public ComparatorOptions() { * @return true if adaptive mutex is used. */ public boolean useAdaptiveMutex() { - assert(isInitialized()); + assert(isOwningHandle()); return useAdaptiveMutex(nativeHandle_); } @@ -39,19 +38,14 @@ public boolean useAdaptiveMutex() { * @return the reference to the current comparator options. 
*/ public ComparatorOptions setUseAdaptiveMutex(final boolean useAdaptiveMutex) { - assert (isInitialized()); + assert (isOwningHandle()); setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); return this; } - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - private native void newComparatorOptions(); + private native static long newComparatorOptions(); private native boolean useAdaptiveMutex(final long handle); private native void setUseAdaptiveMutex(final long handle, final boolean useAdaptiveMutex); - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java b/external/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java index ec0c42f4d0..b4d86166e5 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java index 85aad1e722..878dd4d70f 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,8 +11,8 @@ * DBOptions to control the behavior of a database. It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). * - * If {@link #dispose()} function is not called, then it will be GC'd automatically - * and native resources will be released as part of the process. + * If {@link #dispose()} function is not called, then it will be GC'd + * automatically and native resources will be released as part of the process. */ public class DBOptions extends RocksObject implements DBOptionsInterface { static { @@ -26,9 +26,8 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { * an {@code rocksdb::DBOptions} in the c++ side. */ public DBOptions() { - super(); + super(newDBOptions()); numShardBits_ = DEFAULT_NUM_SHARD_BITS; - newDBOptions(); } /** @@ -75,70 +74,70 @@ public static DBOptions getDBOptionsFromProps( @Override public DBOptions setIncreaseParallelism( final int totalThreads) { - assert (isInitialized()); + assert(isOwningHandle()); setIncreaseParallelism(nativeHandle_, totalThreads); return this; } @Override public DBOptions setCreateIfMissing(final boolean flag) { - assert(isInitialized()); + assert(isOwningHandle()); setCreateIfMissing(nativeHandle_, flag); return this; } @Override public boolean createIfMissing() { - assert(isInitialized()); + assert(isOwningHandle()); return createIfMissing(nativeHandle_); } @Override public DBOptions setCreateMissingColumnFamilies( final boolean flag) { - assert(isInitialized()); + assert(isOwningHandle()); setCreateMissingColumnFamilies(nativeHandle_, flag); return this; } @Override public boolean createMissingColumnFamilies() { - assert(isInitialized()); + assert(isOwningHandle()); return 
createMissingColumnFamilies(nativeHandle_); } @Override public DBOptions setErrorIfExists( final boolean errorIfExists) { - assert(isInitialized()); + assert(isOwningHandle()); setErrorIfExists(nativeHandle_, errorIfExists); return this; } @Override public boolean errorIfExists() { - assert(isInitialized()); + assert(isOwningHandle()); return errorIfExists(nativeHandle_); } @Override public DBOptions setParanoidChecks( final boolean paranoidChecks) { - assert(isInitialized()); + assert(isOwningHandle()); setParanoidChecks(nativeHandle_, paranoidChecks); return this; } @Override public boolean paranoidChecks() { - assert(isInitialized()); + assert(isOwningHandle()); return paranoidChecks(nativeHandle_); } @Override public DBOptions setRateLimiterConfig( final RateLimiterConfig config) { - assert(isInitialized()); + assert(isOwningHandle()); rateLimiterConfig_ = config; setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); return this; @@ -146,7 +145,7 @@ public DBOptions setRateLimiterConfig( @Override public DBOptions setLogger(final Logger logger) { - assert(isInitialized()); + assert(isOwningHandle()); setLogger(nativeHandle_, logger.nativeHandle_); return this; } @@ -154,14 +153,14 @@ public DBOptions setLogger(final Logger logger) { @Override public DBOptions setInfoLogLevel( final InfoLogLevel infoLogLevel) { - assert(isInitialized()); + assert(isOwningHandle()); setInfoLogLevel(nativeHandle_, infoLogLevel.getValue()); return this; } @Override public InfoLogLevel infoLogLevel() { - assert(isInitialized()); + assert(isOwningHandle()); return InfoLogLevel.getInfoLogLevel( infoLogLevel(nativeHandle_)); } @@ -169,41 +168,41 @@ public InfoLogLevel infoLogLevel() { @Override public DBOptions setMaxOpenFiles( final int maxOpenFiles) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxOpenFiles(nativeHandle_, maxOpenFiles); return this; } @Override public int maxOpenFiles() { - assert(isInitialized()); + assert(isOwningHandle()); return 
maxOpenFiles(nativeHandle_); } @Override public DBOptions setMaxTotalWalSize( final long maxTotalWalSize) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxTotalWalSize(nativeHandle_, maxTotalWalSize); return this; } @Override public long maxTotalWalSize() { - assert(isInitialized()); + assert(isOwningHandle()); return maxTotalWalSize(nativeHandle_); } @Override public DBOptions createStatistics() { - assert(isInitialized()); + assert(isOwningHandle()); createStatistics(nativeHandle_); return this; } @Override public Statistics statisticsPtr() { - assert(isInitialized()); + assert(isOwningHandle()); long statsPtr = statisticsPtr(nativeHandle_); if(statsPtr == 0) { @@ -217,287 +216,287 @@ public Statistics statisticsPtr() { @Override public DBOptions setDisableDataSync( final boolean disableDataSync) { - assert(isInitialized()); + assert(isOwningHandle()); setDisableDataSync(nativeHandle_, disableDataSync); return this; } @Override public boolean disableDataSync() { - assert(isInitialized()); + assert(isOwningHandle()); return disableDataSync(nativeHandle_); } @Override public DBOptions setUseFsync( final boolean useFsync) { - assert(isInitialized()); + assert(isOwningHandle()); setUseFsync(nativeHandle_, useFsync); return this; } @Override public boolean useFsync() { - assert(isInitialized()); + assert(isOwningHandle()); return useFsync(nativeHandle_); } @Override public DBOptions setDbLogDir( final String dbLogDir) { - assert(isInitialized()); + assert(isOwningHandle()); setDbLogDir(nativeHandle_, dbLogDir); return this; } @Override public String dbLogDir() { - assert(isInitialized()); + assert(isOwningHandle()); return dbLogDir(nativeHandle_); } @Override public DBOptions setWalDir( final String walDir) { - assert(isInitialized()); + assert(isOwningHandle()); setWalDir(nativeHandle_, walDir); return this; } @Override public String walDir() { - assert(isInitialized()); + assert(isOwningHandle()); return walDir(nativeHandle_); } @Override public 
DBOptions setDeleteObsoleteFilesPeriodMicros( final long micros) { - assert(isInitialized()); + assert(isOwningHandle()); setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); return this; } @Override public long deleteObsoleteFilesPeriodMicros() { - assert(isInitialized()); + assert(isOwningHandle()); return deleteObsoleteFilesPeriodMicros(nativeHandle_); } @Override public DBOptions setMaxBackgroundCompactions( final int maxBackgroundCompactions) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); return this; } @Override public int maxBackgroundCompactions() { - assert(isInitialized()); + assert(isOwningHandle()); return maxBackgroundCompactions(nativeHandle_); } @Override public DBOptions setMaxBackgroundFlushes( final int maxBackgroundFlushes) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); return this; } @Override public int maxBackgroundFlushes() { - assert(isInitialized()); + assert(isOwningHandle()); return maxBackgroundFlushes(nativeHandle_); } @Override public DBOptions setMaxLogFileSize( final long maxLogFileSize) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxLogFileSize(nativeHandle_, maxLogFileSize); return this; } @Override public long maxLogFileSize() { - assert(isInitialized()); + assert(isOwningHandle()); return maxLogFileSize(nativeHandle_); } @Override public DBOptions setLogFileTimeToRoll( final long logFileTimeToRoll) { - assert(isInitialized()); + assert(isOwningHandle()); setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); return this; } @Override public long logFileTimeToRoll() { - assert(isInitialized()); + assert(isOwningHandle()); return logFileTimeToRoll(nativeHandle_); } @Override public DBOptions setKeepLogFileNum( final long keepLogFileNum) { - assert(isInitialized()); + assert(isOwningHandle()); setKeepLogFileNum(nativeHandle_, keepLogFileNum); return this; } 
@Override public long keepLogFileNum() { - assert(isInitialized()); + assert(isOwningHandle()); return keepLogFileNum(nativeHandle_); } @Override public DBOptions setMaxManifestFileSize( final long maxManifestFileSize) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); return this; } @Override public long maxManifestFileSize() { - assert(isInitialized()); + assert(isOwningHandle()); return maxManifestFileSize(nativeHandle_); } @Override public DBOptions setTableCacheNumshardbits( final int tableCacheNumshardbits) { - assert(isInitialized()); + assert(isOwningHandle()); setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); return this; } @Override public int tableCacheNumshardbits() { - assert(isInitialized()); + assert(isOwningHandle()); return tableCacheNumshardbits(nativeHandle_); } @Override public DBOptions setWalTtlSeconds( final long walTtlSeconds) { - assert(isInitialized()); + assert(isOwningHandle()); setWalTtlSeconds(nativeHandle_, walTtlSeconds); return this; } @Override public long walTtlSeconds() { - assert(isInitialized()); + assert(isOwningHandle()); return walTtlSeconds(nativeHandle_); } @Override public DBOptions setWalSizeLimitMB( final long sizeLimitMB) { - assert(isInitialized()); + assert(isOwningHandle()); setWalSizeLimitMB(nativeHandle_, sizeLimitMB); return this; } @Override public long walSizeLimitMB() { - assert(isInitialized()); + assert(isOwningHandle()); return walSizeLimitMB(nativeHandle_); } @Override public DBOptions setManifestPreallocationSize( final long size) { - assert(isInitialized()); + assert(isOwningHandle()); setManifestPreallocationSize(nativeHandle_, size); return this; } @Override public long manifestPreallocationSize() { - assert(isInitialized()); + assert(isOwningHandle()); return manifestPreallocationSize(nativeHandle_); } @Override public DBOptions setAllowOsBuffer( final boolean allowOsBuffer) { - assert(isInitialized()); + 
assert(isOwningHandle()); setAllowOsBuffer(nativeHandle_, allowOsBuffer); return this; } @Override public boolean allowOsBuffer() { - assert(isInitialized()); + assert(isOwningHandle()); return allowOsBuffer(nativeHandle_); } @Override public DBOptions setAllowMmapReads( final boolean allowMmapReads) { - assert(isInitialized()); + assert(isOwningHandle()); setAllowMmapReads(nativeHandle_, allowMmapReads); return this; } @Override public boolean allowMmapReads() { - assert(isInitialized()); + assert(isOwningHandle()); return allowMmapReads(nativeHandle_); } @Override public DBOptions setAllowMmapWrites( final boolean allowMmapWrites) { - assert(isInitialized()); + assert(isOwningHandle()); setAllowMmapWrites(nativeHandle_, allowMmapWrites); return this; } @Override public boolean allowMmapWrites() { - assert(isInitialized()); + assert(isOwningHandle()); return allowMmapWrites(nativeHandle_); } @Override public DBOptions setIsFdCloseOnExec( final boolean isFdCloseOnExec) { - assert(isInitialized()); + assert(isOwningHandle()); setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); return this; } @Override public boolean isFdCloseOnExec() { - assert(isInitialized()); + assert(isOwningHandle()); return isFdCloseOnExec(nativeHandle_); } @Override public DBOptions setStatsDumpPeriodSec( final int statsDumpPeriodSec) { - assert(isInitialized()); + assert(isOwningHandle()); setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); return this; } @Override public int statsDumpPeriodSec() { - assert(isInitialized()); + assert(isOwningHandle()); return statsDumpPeriodSec(nativeHandle_); } @Override public DBOptions setAdviseRandomOnOpen( final boolean adviseRandomOnOpen) { - assert(isInitialized()); + assert(isOwningHandle()); setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); return this; } @@ -510,21 +509,21 @@ public boolean adviseRandomOnOpen() { @Override public DBOptions setUseAdaptiveMutex( final boolean useAdaptiveMutex) { - assert(isInitialized()); + 
assert(isOwningHandle()); setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); return this; } @Override public boolean useAdaptiveMutex() { - assert(isInitialized()); + assert(isOwningHandle()); return useAdaptiveMutex(nativeHandle_); } @Override public DBOptions setBytesPerSync( final long bytesPerSync) { - assert(isInitialized()); + assert(isOwningHandle()); setBytesPerSync(nativeHandle_, bytesPerSync); return this; } @@ -534,33 +533,23 @@ public long bytesPerSync() { return bytesPerSync(nativeHandle_); } - /** - * Release the memory allocated for the current instance - * in the c++ side. - */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - static final int DEFAULT_NUM_SHARD_BITS = -1; /** *

Private constructor to be used by * {@link #getDBOptionsFromProps(java.util.Properties)}

* - * @param handle native handle to DBOptions instance. + * @param nativeHandle native handle to DBOptions instance. */ - private DBOptions(final long handle) { - super(); - nativeHandle_ = handle; + private DBOptions(final long nativeHandle) { + super(nativeHandle); } private static native long getDBOptionsFromProps( String optString); - private native void newDBOptions(); - private native void disposeInternal(long handle); + private native static long newDBOptions(); + @Override protected final native void disposeInternal(final long handle); private native void setIncreaseParallelism(long handle, int totalThreads); private native void setCreateIfMissing(long handle, boolean flag); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/external/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java index f710105a61..917e26ab08 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -565,7 +565,7 @@ public interface DBOptionsInterface { * are older than WAL_ttl_seconds will be deleted. *
  • If both are not 0, WAL files will be checked every 10 min and both * checks will be performed with ttl being first.
  • - * + * * * @param sizeLimitMB size limit in mega-bytes. * @return the instance of the current Object. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java b/external/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java index 47f4d7256a..d288047569 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -15,10 +15,19 @@ * using @see org.rocksdb.Comparator */ public abstract class DirectComparator extends AbstractComparator { + + private final long nativeHandle_; + public DirectComparator(final ComparatorOptions copt) { super(); - createNewDirectComparator0(copt.nativeHandle_); + this.nativeHandle_ = createNewDirectComparator0(copt.nativeHandle_); + } + + @Override + protected final long getNativeHandle() { + return nativeHandle_; } - private native void createNewDirectComparator0(final long comparatorOptionsHandle); + private native long createNewDirectComparator0( + final long comparatorOptionsHandle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java b/external/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java index 765b01586e..8f96eb49f6 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,7 +16,6 @@ * values consider using @see org.rocksdb.Slice */ public class DirectSlice extends AbstractSlice { - //TODO(AR) only needed by WriteBatchWithIndexTest until JDK8 public final static DirectSlice NONE = new DirectSlice(); /** @@ -24,17 +23,15 @@ public class DirectSlice extends AbstractSlice { * without an underlying C++ object set * at creation time. * - * Note: You should be aware that - * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally - * called from the default DirectSlice constructor, and that it is marked as - * package-private. This is so that developers cannot construct their own default - * DirectSlice objects (at present). As developers cannot construct their own - * DirectSlice objects through this, they are not creating underlying C++ - * DirectSlice objects, and so there is nothing to free (dispose) from Java. + * Note: You should be aware that it is intentionally marked as + * package-private. This is so that developers cannot construct their own + * default DirectSlice objects (at present). As developers cannot construct + * their own DirectSlice objects through this, they are not creating + * underlying C++ DirectSlice objects, and so there is nothing to free + * (dispose) from Java. 
*/ DirectSlice() { super(); - disOwnNativeHandle(); } /** @@ -45,8 +42,7 @@ public class DirectSlice extends AbstractSlice { * @param str The string */ public DirectSlice(final String str) { - super(); - createNewSliceFromString(str); + super(createNewSliceFromString(str)); } /** @@ -58,9 +54,7 @@ public DirectSlice(final String str) { * @param length The length of the data to use for the slice */ public DirectSlice(final ByteBuffer data, final int length) { - super(); - assert(data.isDirect()); - createNewDirectSlice0(data, length); + super(createNewDirectSlice0(ensureDirect(data), length)); } /** @@ -71,9 +65,14 @@ public DirectSlice(final ByteBuffer data, final int length) { * @param data The bugger containing the data */ public DirectSlice(final ByteBuffer data) { - super(); + super(createNewDirectSlice1(ensureDirect(data))); + } + + private static ByteBuffer ensureDirect(final ByteBuffer data) { + // TODO(AR) consider throwing a checked exception, as if it's not direct + // this can SIGSEGV assert(data.isDirect()); - createNewDirectSlice1(data); + return data; } /** @@ -85,16 +84,14 @@ public DirectSlice(final ByteBuffer data) { * @return the requested byte */ public byte get(int offset) { - assert (isInitialized()); - return get0(nativeHandle_, offset); + return get0(getNativeHandle(), offset); } /** * Clears the backing slice */ public void clear() { - assert (isInitialized()); - clear0(nativeHandle_); + clear0(getNativeHandle()); } /** @@ -105,12 +102,12 @@ public void clear() { * @param n The number of bytes to drop */ public void removePrefix(final int n) { - assert (isInitialized()); - removePrefix0(nativeHandle_, n); + removePrefix0(getNativeHandle(), n); } - private native void createNewDirectSlice0(ByteBuffer data, int length); - private native void createNewDirectSlice1(ByteBuffer data); + private native static long createNewDirectSlice0(final ByteBuffer data, + final int length); + private native static long createNewDirectSlice1(final ByteBuffer 
data); @Override protected final native ByteBuffer data0(long handle); private native byte get0(long handle, int offset); private native void clear0(long handle); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java b/external/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java index d639542aa4..e27a9853ff 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Env.java b/external/rocksdb/java/src/main/java/org/rocksdb/Env.java index 929a394c36..7d30ea5df1 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Env.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Env.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -70,8 +70,8 @@ public int getThreadPoolQueueLen(final int poolID) { } - protected Env() { - super(); + protected Env(final long nativeHandle) { + super(nativeHandle); } static { diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Filter.java b/external/rocksdb/java/src/main/java/org/rocksdb/Filter.java index ce5c41f26d..01853d9694 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Filter.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Filter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,7 +13,10 @@ * DB::Get() call. */ public abstract class Filter extends RocksObject { - protected abstract void createNewFilter(); + + protected Filter(final long nativeHandle) { + super(nativeHandle); + } /** * Deletes underlying C++ filter pointer. @@ -22,10 +25,11 @@ public abstract class Filter extends RocksObject { * RocksDB instances referencing the filter are closed. * Otherwise an undefined behavior will occur. 
*/ - @Override protected void disposeInternal() { - assert(isInitialized()); + @Override + protected void disposeInternal() { disposeInternal(nativeHandle_); } - private native void disposeInternal(long handle); + @Override + protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java index 9ddf95f1ca..4931b5d85b 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java @@ -10,8 +10,7 @@ public class FlushOptions extends RocksObject { * Construct a new instance of FlushOptions. */ public FlushOptions(){ - super(); - newFlushOptions(); + super(newFlushOptions()); } /** @@ -23,7 +22,7 @@ public FlushOptions(){ * @return instance of current FlushOptions. */ public FlushOptions setWaitForFlush(final boolean waitForFlush) { - assert(isInitialized()); + assert(isOwningHandle()); setWaitForFlush(nativeHandle_, waitForFlush); return this; } @@ -35,16 +34,12 @@ public FlushOptions setWaitForFlush(final boolean waitForFlush) { * waits for termination of the flush process. 
*/ public boolean waitForFlush() { - assert(isInitialized()); + assert(isOwningHandle()); return waitForFlush(nativeHandle_); } - @Override protected void disposeInternal() { - disposeInternal(nativeHandle_); - } - - private native void newFlushOptions(); - private native void disposeInternal(long handle); + private native static long newFlushOptions(); + @Override protected final native void disposeInternal(final long handle); private native void setWaitForFlush(long handle, boolean wait); private native boolean waitForFlush(long handle); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java b/external/rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java index 89951c5d1c..cc00c6f0ae 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java b/external/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java index 020a9c9a52..a920f4b4e2 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java b/external/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java index 9b45481082..a4459eecc8 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/IndexType.java b/external/rocksdb/java/src/main/java/org/rocksdb/IndexType.java index f3c1045660..db24a6f681 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/IndexType.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/IndexType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java b/external/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java index e67063c684..971c0b2ec5 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java @@ -9,7 +9,8 @@ public enum InfoLogLevel { WARN_LEVEL((byte)2), ERROR_LEVEL((byte)3), FATAL_LEVEL((byte)4), - NUM_INFO_LOG_LEVELS((byte)5); + HEADER_LEVEL((byte)5), + NUM_INFO_LOG_LEVELS((byte)6); private final byte value_; diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Logger.java b/external/rocksdb/java/src/main/java/org/rocksdb/Logger.java index 05c53b56e4..5db377dde1 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Logger.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Logger.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -35,7 +35,9 @@ * {@link org.rocksdb.InfoLogLevel#FATAL_LEVEL}. *

    */ -public abstract class Logger extends RocksObject { +public abstract class Logger extends AbstractImmutableNativeReference { + + final long nativeHandle_; /** *

    AbstractLogger constructor.

    @@ -47,7 +49,8 @@ public abstract class Logger extends RocksObject { * @param options {@link org.rocksdb.Options} instance. */ public Logger(final Options options) { - createNewLoggerOptions(options.nativeHandle_); + super(true); + this.nativeHandle_ = createNewLoggerOptions(options.nativeHandle_); } /** @@ -60,7 +63,8 @@ public Logger(final Options options) { * @param dboptions {@link org.rocksdb.DBOptions} instance. */ public Logger(final DBOptions dboptions) { - createNewLoggerDbOptions(dboptions.nativeHandle_); + super(true); + this.nativeHandle_ = createNewLoggerDbOptions(dboptions.nativeHandle_); } /** @@ -93,16 +97,15 @@ protected abstract void log(InfoLogLevel infoLogLevel, */ @Override protected void disposeInternal() { - assert(isInitialized()); disposeInternal(nativeHandle_); } - protected native void createNewLoggerOptions( + protected native long createNewLoggerOptions( long options); - protected native void createNewLoggerDbOptions( + protected native long createNewLoggerDbOptions( long dbOptions); protected native void setInfoLogLevel(long handle, byte infoLogLevel); protected native byte infoLogLevel(long handle); - private native void disposeInternal(long handle); + private native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java b/external/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java index 7c34826e1d..8b854917f9 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/external/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java index dca9b3119f..96d364cd1c 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -19,7 +19,7 @@ public class NativeLibraryLoader { private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; - private static final String tempFileSuffix = "." + Environment.getJniLibraryExtension(); + private static final String tempFileSuffix = Environment.getJniLibraryExtension(); /** * Get a reference to the NativeLibraryLoader @@ -75,37 +75,47 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException { void loadLibraryFromJar(final String tmpDir) throws IOException { if (!initialized) { - final File temp; - if (tmpDir == null || tmpDir.equals("")) { - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - } else { - temp = new File(tmpDir, jniLibraryFileName); - if (!temp.createNewFile()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " could not be created."); - } - } + System.load(loadLibraryFromJarToTemp(tmpDir).getAbsolutePath()); + initialized = true; + } + } - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } else { - temp.deleteOnExit(); + File loadLibraryFromJarToTemp(final String tmpDir) + throws IOException { + final File temp; + if (tmpDir == null || tmpDir.isEmpty()) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + temp = new File(tmpDir, jniLibraryFileName); + if (temp.exists() && !temp.delete()) { + throw new RuntimeException("File: " + temp.getAbsolutePath() + + " already 
exists and cannot be removed."); } - - // attempt to copy the library from the Jar file to the temp destination - try (final InputStream is = getClass().getClassLoader(). - getResourceAsStream(jniLibraryFileName)) { - if (is == null) { - throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); - } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); - } + if (!temp.createNewFile()) { + throw new RuntimeException("File: " + temp.getAbsolutePath() + + " could not be created."); } + } - System.load(temp.getAbsolutePath()); - initialized = true; + if (!temp.exists()) { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } else { + temp.deleteOnExit(); } + + // attempt to copy the library from the Jar file to the temp destination + try (final InputStream is = getClass().getClassLoader(). + getResourceAsStream(jniLibraryFileName)) { + if (is == null) { + throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); + } else { + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + } + + return temp; } + /** * Private constructor to disallow instantiation */ diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Options.java b/external/rocksdb/java/src/main/java/org/rocksdb/Options.java index 771de0ac63..dcc512f2db 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Options.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Options.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,8 +12,8 @@ * Options to control the behavior of a database. 
It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). * - * If {@link #dispose()} function is not called, then it will be GC'd automatically - * and native resources will be released as part of the process. + * If {@link #dispose()} function is not called, then it will be GC'd + * automaticallyand native resources will be released as part of the process. */ public class Options extends RocksObject implements DBOptionsInterface, ColumnFamilyOptionsInterface { @@ -27,8 +27,7 @@ public class Options extends RocksObject * an {@code rocksdb::Options} in the c++ side. */ public Options() { - super(); - newOptions(); + super(newOptions()); env_ = Env.getDefault(); } @@ -42,28 +41,28 @@ public Options() { */ public Options(final DBOptions dbOptions, final ColumnFamilyOptions columnFamilyOptions) { - super(); - newOptions(dbOptions.nativeHandle_, columnFamilyOptions.nativeHandle_); + super(newOptions(dbOptions.nativeHandle_, + columnFamilyOptions.nativeHandle_)); env_ = Env.getDefault(); } @Override public Options setIncreaseParallelism(final int totalThreads) { - assert(isInitialized()); + assert(isOwningHandle()); setIncreaseParallelism(nativeHandle_, totalThreads); return this; } @Override public Options setCreateIfMissing(final boolean flag) { - assert(isInitialized()); + assert(isOwningHandle()); setCreateIfMissing(nativeHandle_, flag); return this; } @Override public Options setCreateMissingColumnFamilies(final boolean flag) { - assert(isInitialized()); + assert(isOwningHandle()); setCreateMissingColumnFamilies(nativeHandle_, flag); return this; } @@ -77,7 +76,7 @@ public Options setCreateMissingColumnFamilies(final boolean flag) { * @return the instance of the current Options. 
*/ public Options setEnv(final Env env) { - assert(isInitialized()); + assert(isOwningHandle()); setEnv(nativeHandle_, env.nativeHandle_); env_ = env; return this; @@ -111,13 +110,13 @@ public Options prepareForBulkLoad() { @Override public boolean createIfMissing() { - assert(isInitialized()); + assert(isOwningHandle()); return createIfMissing(nativeHandle_); } @Override public boolean createMissingColumnFamilies() { - assert(isInitialized()); + assert(isOwningHandle()); return createMissingColumnFamilies(nativeHandle_); } @@ -161,7 +160,7 @@ public Options optimizeUniversalStyleCompaction( @Override public Options setComparator(final BuiltinComparator builtinComparator) { - assert(isInitialized()); + assert(isOwningHandle()); setComparatorHandle(nativeHandle_, builtinComparator.ordinal()); return this; } @@ -169,15 +168,15 @@ public Options setComparator(final BuiltinComparator builtinComparator) { @Override public Options setComparator( final AbstractComparator> comparator) { - assert (isInitialized()); - setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + assert(isOwningHandle()); + setComparatorHandle(nativeHandle_, comparator.getNativeHandle()); comparator_ = comparator; return this; } @Override public Options setMergeOperatorName(final String name) { - assert (isInitialized()); + assert(isOwningHandle()); if (name == null) { throw new IllegalArgumentException( "Merge operator name must not be null."); @@ -194,164 +193,164 @@ public Options setMergeOperator(final MergeOperator mergeOperator) { @Override public Options setWriteBufferSize(final long writeBufferSize) { - assert(isInitialized()); + assert(isOwningHandle()); setWriteBufferSize(nativeHandle_, writeBufferSize); return this; } @Override public long writeBufferSize() { - assert(isInitialized()); + assert(isOwningHandle()); return writeBufferSize(nativeHandle_); } @Override public Options setMaxWriteBufferNumber(final int maxWriteBufferNumber) { - assert(isInitialized()); + 
assert(isOwningHandle()); setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); return this; } @Override public int maxWriteBufferNumber() { - assert(isInitialized()); + assert(isOwningHandle()); return maxWriteBufferNumber(nativeHandle_); } @Override public boolean errorIfExists() { - assert(isInitialized()); + assert(isOwningHandle()); return errorIfExists(nativeHandle_); } @Override public Options setErrorIfExists(final boolean errorIfExists) { - assert(isInitialized()); + assert(isOwningHandle()); setErrorIfExists(nativeHandle_, errorIfExists); return this; } @Override public boolean paranoidChecks() { - assert(isInitialized()); + assert(isOwningHandle()); return paranoidChecks(nativeHandle_); } @Override public Options setParanoidChecks(final boolean paranoidChecks) { - assert(isInitialized()); + assert(isOwningHandle()); setParanoidChecks(nativeHandle_, paranoidChecks); return this; } @Override public int maxOpenFiles() { - assert(isInitialized()); + assert(isOwningHandle()); return maxOpenFiles(nativeHandle_); } @Override public Options setMaxTotalWalSize(final long maxTotalWalSize) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxTotalWalSize(nativeHandle_, maxTotalWalSize); return this; } @Override public long maxTotalWalSize() { - assert(isInitialized()); + assert(isOwningHandle()); return maxTotalWalSize(nativeHandle_); } @Override public Options setMaxOpenFiles(final int maxOpenFiles) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxOpenFiles(nativeHandle_, maxOpenFiles); return this; } @Override public boolean disableDataSync() { - assert(isInitialized()); + assert(isOwningHandle()); return disableDataSync(nativeHandle_); } @Override public Options setDisableDataSync(final boolean disableDataSync) { - assert(isInitialized()); + assert(isOwningHandle()); setDisableDataSync(nativeHandle_, disableDataSync); return this; } @Override public boolean useFsync() { - assert(isInitialized()); + assert(isOwningHandle()); return 
useFsync(nativeHandle_); } @Override public Options setUseFsync(final boolean useFsync) { - assert(isInitialized()); + assert(isOwningHandle()); setUseFsync(nativeHandle_, useFsync); return this; } @Override public String dbLogDir() { - assert(isInitialized()); + assert(isOwningHandle()); return dbLogDir(nativeHandle_); } @Override public Options setDbLogDir(final String dbLogDir) { - assert(isInitialized()); + assert(isOwningHandle()); setDbLogDir(nativeHandle_, dbLogDir); return this; } @Override public String walDir() { - assert(isInitialized()); + assert(isOwningHandle()); return walDir(nativeHandle_); } @Override public Options setWalDir(final String walDir) { - assert(isInitialized()); + assert(isOwningHandle()); setWalDir(nativeHandle_, walDir); return this; } @Override public long deleteObsoleteFilesPeriodMicros() { - assert(isInitialized()); + assert(isOwningHandle()); return deleteObsoleteFilesPeriodMicros(nativeHandle_); } @Override public Options setDeleteObsoleteFilesPeriodMicros( final long micros) { - assert(isInitialized()); + assert(isOwningHandle()); setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); return this; } @Override public int maxBackgroundCompactions() { - assert(isInitialized()); + assert(isOwningHandle()); return maxBackgroundCompactions(nativeHandle_); } @Override public Options createStatistics() { - assert(isInitialized()); + assert(isOwningHandle()); createStatistics(nativeHandle_); return this; } @Override public Statistics statisticsPtr() { - assert(isInitialized()); + assert(isOwningHandle()); long statsPtr = statisticsPtr(nativeHandle_); if(statsPtr == 0) { @@ -365,74 +364,74 @@ public Statistics statisticsPtr() { @Override public Options setMaxBackgroundCompactions( final int maxBackgroundCompactions) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); return this; } @Override public int maxBackgroundFlushes() { - assert(isInitialized()); + 
assert(isOwningHandle()); return maxBackgroundFlushes(nativeHandle_); } @Override public Options setMaxBackgroundFlushes( final int maxBackgroundFlushes) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); return this; } @Override public long maxLogFileSize() { - assert(isInitialized()); + assert(isOwningHandle()); return maxLogFileSize(nativeHandle_); } @Override public Options setMaxLogFileSize(final long maxLogFileSize) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxLogFileSize(nativeHandle_, maxLogFileSize); return this; } @Override public long logFileTimeToRoll() { - assert(isInitialized()); + assert(isOwningHandle()); return logFileTimeToRoll(nativeHandle_); } @Override public Options setLogFileTimeToRoll(final long logFileTimeToRoll) { - assert(isInitialized()); + assert(isOwningHandle()); setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); return this; } @Override public long keepLogFileNum() { - assert(isInitialized()); + assert(isOwningHandle()); return keepLogFileNum(nativeHandle_); } @Override public Options setKeepLogFileNum(final long keepLogFileNum) { - assert(isInitialized()); + assert(isOwningHandle()); setKeepLogFileNum(nativeHandle_, keepLogFileNum); return this; } @Override public long maxManifestFileSize() { - assert(isInitialized()); + assert(isOwningHandle()); return maxManifestFileSize(nativeHandle_); } @Override public Options setMaxManifestFileSize( final long maxManifestFileSize) { - assert(isInitialized()); + assert(isOwningHandle()); setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); return this; } @@ -441,7 +440,7 @@ public Options setMaxManifestFileSize( public Options setMaxTableFilesSizeFIFO( final long maxTableFilesSize) { assert(maxTableFilesSize > 0); // unsigned native type - assert(isInitialized()); + assert(isOwningHandle()); setMaxTableFilesSizeFIFO(nativeHandle_, maxTableFilesSize); return this; } @@ -453,118 +452,118 @@ public 
long maxTableFilesSizeFIFO() { @Override public int tableCacheNumshardbits() { - assert(isInitialized()); + assert(isOwningHandle()); return tableCacheNumshardbits(nativeHandle_); } @Override public Options setTableCacheNumshardbits( final int tableCacheNumshardbits) { - assert(isInitialized()); + assert(isOwningHandle()); setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); return this; } @Override public long walTtlSeconds() { - assert(isInitialized()); + assert(isOwningHandle()); return walTtlSeconds(nativeHandle_); } @Override public Options setWalTtlSeconds(final long walTtlSeconds) { - assert(isInitialized()); + assert(isOwningHandle()); setWalTtlSeconds(nativeHandle_, walTtlSeconds); return this; } @Override public long walSizeLimitMB() { - assert(isInitialized()); + assert(isOwningHandle()); return walSizeLimitMB(nativeHandle_); } @Override public Options setWalSizeLimitMB(final long sizeLimitMB) { - assert(isInitialized()); + assert(isOwningHandle()); setWalSizeLimitMB(nativeHandle_, sizeLimitMB); return this; } @Override public long manifestPreallocationSize() { - assert(isInitialized()); + assert(isOwningHandle()); return manifestPreallocationSize(nativeHandle_); } @Override public Options setManifestPreallocationSize(final long size) { - assert(isInitialized()); + assert(isOwningHandle()); setManifestPreallocationSize(nativeHandle_, size); return this; } @Override public boolean allowOsBuffer() { - assert(isInitialized()); + assert(isOwningHandle()); return allowOsBuffer(nativeHandle_); } @Override public Options setAllowOsBuffer(final boolean allowOsBuffer) { - assert(isInitialized()); + assert(isOwningHandle()); setAllowOsBuffer(nativeHandle_, allowOsBuffer); return this; } @Override public boolean allowMmapReads() { - assert(isInitialized()); + assert(isOwningHandle()); return allowMmapReads(nativeHandle_); } @Override public Options setAllowMmapReads(final boolean allowMmapReads) { - assert(isInitialized()); + assert(isOwningHandle()); 
setAllowMmapReads(nativeHandle_, allowMmapReads); return this; } @Override public boolean allowMmapWrites() { - assert(isInitialized()); + assert(isOwningHandle()); return allowMmapWrites(nativeHandle_); } @Override public Options setAllowMmapWrites(final boolean allowMmapWrites) { - assert(isInitialized()); + assert(isOwningHandle()); setAllowMmapWrites(nativeHandle_, allowMmapWrites); return this; } @Override public boolean isFdCloseOnExec() { - assert(isInitialized()); + assert(isOwningHandle()); return isFdCloseOnExec(nativeHandle_); } @Override public Options setIsFdCloseOnExec(final boolean isFdCloseOnExec) { - assert(isInitialized()); + assert(isOwningHandle()); setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); return this; } @Override public int statsDumpPeriodSec() { - assert(isInitialized()); + assert(isOwningHandle()); return statsDumpPeriodSec(nativeHandle_); } @Override public Options setStatsDumpPeriodSec(final int statsDumpPeriodSec) { - assert(isInitialized()); + assert(isOwningHandle()); setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); return this; } @@ -576,20 +575,20 @@ public boolean adviseRandomOnOpen() { @Override public Options setAdviseRandomOnOpen(final boolean adviseRandomOnOpen) { - assert(isInitialized()); + assert(isOwningHandle()); setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); return this; } @Override public boolean useAdaptiveMutex() { - assert(isInitialized()); + assert(isOwningHandle()); return useAdaptiveMutex(nativeHandle_); } @Override public Options setUseAdaptiveMutex(final boolean useAdaptiveMutex) { - assert(isInitialized()); + assert(isOwningHandle()); setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); return this; } @@ -601,7 +600,7 @@ public long bytesPerSync() { @Override public Options setBytesPerSync(final long bytesPerSync) { - assert(isInitialized()); + assert(isOwningHandle()); setBytesPerSync(nativeHandle_, bytesPerSync); return this; } @@ -622,28 +621,28 @@ public Options 
setRateLimiterConfig(final RateLimiterConfig config) { @Override public Options setLogger(final Logger logger) { - assert(isInitialized()); + assert(isOwningHandle()); setLogger(nativeHandle_, logger.nativeHandle_); return this; } @Override public Options setInfoLogLevel(final InfoLogLevel infoLogLevel) { - assert(isInitialized()); + assert(isOwningHandle()); setInfoLogLevel(nativeHandle_, infoLogLevel.getValue()); return this; } @Override public InfoLogLevel infoLogLevel() { - assert(isInitialized()); + assert(isOwningHandle()); return InfoLogLevel.getInfoLogLevel( infoLogLevel(nativeHandle_)); } @Override public String memTableFactoryName() { - assert(isInitialized()); + assert(isOwningHandle()); return memTableFactoryName(nativeHandle_); } @@ -656,20 +655,20 @@ public Options setTableFormatConfig(final TableFormatConfig config) { @Override public String tableFactoryName() { - assert(isInitialized()); + assert(isOwningHandle()); return tableFactoryName(nativeHandle_); } @Override public Options useFixedLengthPrefixExtractor(final int n) { - assert(isInitialized()); + assert(isOwningHandle()); useFixedLengthPrefixExtractor(nativeHandle_, n); return this; } @Override public Options useCappedPrefixExtractor(final int n) { - assert(isInitialized()); + assert(isOwningHandle()); useCappedPrefixExtractor(nativeHandle_, n); return this; } @@ -680,11 +679,12 @@ public CompressionType compressionType() { } @Override - public Options setCompressionPerLevel(final List compressionLevels) { - final List byteCompressionTypes = new ArrayList<>( - compressionLevels.size()); - for (final CompressionType compressionLevel : compressionLevels) { - byteCompressionTypes.add(compressionLevel.getValue()); + public Options setCompressionPerLevel( + final List compressionLevels) { + final byte[] byteCompressionTypes = new byte[ + compressionLevels.size()]; + for (int i = 0; i < compressionLevels.size(); i++) { + byteCompressionTypes[i] = compressionLevels.get(i).getValue(); } 
setCompressionPerLevel(nativeHandle_, byteCompressionTypes); return this; @@ -692,7 +692,7 @@ public Options setCompressionPerLevel(final List compressionLev @Override public List compressionPerLevel() { - final List byteCompressionTypes = + final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); for (final Byte byteCompressionType : byteCompressionTypes) { @@ -955,18 +955,6 @@ public Options setVerifyChecksumsInCompaction( return this; } - @Override - public boolean filterDeletes() { - return filterDeletes(nativeHandle_); - } - - @Override - public Options setFilterDeletes( - final boolean filterDeletes) { - setFilterDeletes(nativeHandle_, filterDeletes); - return this; - } - @Override public long maxSequentialSkipInIterations() { return maxSequentialSkipInIterations(nativeHandle_); @@ -975,7 +963,8 @@ public long maxSequentialSkipInIterations() { @Override public Options setMaxSequentialSkipInIterations( final long maxSequentialSkipInIterations) { - setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); + setMaxSequentialSkipInIterations(nativeHandle_, + maxSequentialSkipInIterations); return this; } @@ -1004,26 +993,13 @@ public Options setInplaceUpdateNumLocks( } @Override - public int memtablePrefixBloomBits() { - return memtablePrefixBloomBits(nativeHandle_); - } - - @Override - public Options setMemtablePrefixBloomBits( - final int memtablePrefixBloomBits) { - setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits); - return this; - } - - @Override - public int memtablePrefixBloomProbes() { - return memtablePrefixBloomProbes(nativeHandle_); + public double memtablePrefixBloomSizeRatio() { + return memtablePrefixBloomSizeRatio(nativeHandle_); } @Override - public Options setMemtablePrefixBloomProbes( - final int memtablePrefixBloomProbes) { - setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); + public Options 
setMemtablePrefixBloomSizeRatio(final double memtablePrefixBloomSizeRatio) { + setMemtablePrefixBloomSizeRatio(nativeHandle_, memtablePrefixBloomSizeRatio); return this; } @@ -1085,19 +1061,10 @@ public boolean optimizeFiltersForHits() { return optimizeFiltersForHits(nativeHandle_); } - /** - * Release the memory allocated for the current instance - * in the c++ side. - */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - private native void newOptions(); - private native void newOptions(long dbOptHandle, + private native static long newOptions(); + private native static long newOptions(long dbOptHandle, long cfOptHandle); - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); private native void setEnv(long optHandle, long envHandle); private native void prepareForBulkLoad(long handle); @@ -1200,7 +1167,8 @@ private native void optimizeLevelStyleCompaction(long handle, private native void optimizeUniversalStyleCompaction(long handle, long memtableMemoryBudget); private native void setComparatorHandle(long handle, int builtinComparator); - private native void setComparatorHandle(long optHandle, long comparatorHandle); + private native void setComparatorHandle(long optHandle, + long comparatorHandle); private native void setMergeOperatorName( long handle, String name); private native void setMergeOperator( @@ -1217,8 +1185,8 @@ private native void setMinWriteBufferNumberToMerge( private native void setCompressionType(long handle, byte compressionType); private native byte compressionType(long handle); private native void setCompressionPerLevel(long handle, - List compressionLevels); - private native List compressionPerLevel(long handle); + byte[] compressionLevels); + private native byte[] compressionPerLevel(long handle); private native void useFixedLengthPrefixExtractor( long handle, int prefixLength); private native void 
useCappedPrefixExtractor( @@ -1283,9 +1251,6 @@ private native void setPurgeRedundantKvsWhileFlush( private native void setVerifyChecksumsInCompaction( long handle, boolean verifyChecksumsInCompaction); private native boolean verifyChecksumsInCompaction(long handle); - private native void setFilterDeletes( - long handle, boolean filterDeletes); - private native boolean filterDeletes(long handle); private native void setMaxSequentialSkipInIterations( long handle, long maxSequentialSkipInIterations); private native long maxSequentialSkipInIterations(long handle); @@ -1300,12 +1265,9 @@ private native void setInplaceUpdateNumLocks( long handle, long inplaceUpdateNumLocks) throws IllegalArgumentException; private native long inplaceUpdateNumLocks(long handle); - private native void setMemtablePrefixBloomBits( - long handle, int memtablePrefixBloomBits); - private native int memtablePrefixBloomBits(long handle); - private native void setMemtablePrefixBloomProbes( - long handle, int memtablePrefixBloomProbes); - private native int memtablePrefixBloomProbes(long handle); + private native void setMemtablePrefixBloomSizeRatio( + long handle, double memtablePrefixBloomSizeRatio); + private native double memtablePrefixBloomSizeRatio(long handle); private native void setBloomLocality( long handle, int bloomLocality); private native int bloomLocality(long handle); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java b/external/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java index 3a41bea847..044c18d803 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java b/external/rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java index 09d1c7a04a..d2e7459e35 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java index a72a6e0d89..9bb23d0139 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,10 +13,8 @@ */ public class ReadOptions extends RocksObject { public ReadOptions() { - super(); - newReadOptions(); + super(newReadOptions()); } - private native void newReadOptions(); /** * If true, all data read from underlying storage will be @@ -26,10 +24,9 @@ public ReadOptions() { * @return true if checksum verification is on. 
*/ public boolean verifyChecksums() { - assert(isInitialized()); + assert(isOwningHandle()); return verifyChecksums(nativeHandle_); } - private native boolean verifyChecksums(long handle); /** * If true, all data read from underlying storage will be @@ -42,12 +39,10 @@ public boolean verifyChecksums() { */ public ReadOptions setVerifyChecksums( final boolean verifyChecksums) { - assert(isInitialized()); + assert(isOwningHandle()); setVerifyChecksums(nativeHandle_, verifyChecksums); return this; } - private native void setVerifyChecksums( - long handle, boolean verifyChecksums); // TODO(yhchiang): this option seems to be block-based table only. // move this to a better place? @@ -59,10 +54,9 @@ private native void setVerifyChecksums( * @return true if the fill-cache behavior is on. */ public boolean fillCache() { - assert(isInitialized()); + assert(isOwningHandle()); return fillCache(nativeHandle_); } - private native boolean fillCache(long handle); /** * Fill the cache when loading the block-based sst formatted db. @@ -74,12 +68,25 @@ public boolean fillCache() { * @return the reference to the current ReadOptions. */ public ReadOptions setFillCache(final boolean fillCache) { - assert(isInitialized()); + assert(isOwningHandle()); setFillCache(nativeHandle_, fillCache); return this; } - private native void setFillCache( - long handle, boolean fillCache); + + /** + * Returns the currently assigned Snapshot instance. + * + * @return the Snapshot assigned to this instance. If no Snapshot + * is assigned null. + */ + public Snapshot snapshot() { + assert(isOwningHandle()); + long snapshotHandle = snapshot(nativeHandle_); + if (snapshotHandle != 0) { + return new Snapshot(snapshotHandle); + } + return null; + } /** *

    If "snapshot" is non-nullptr, read as of the supplied snapshot @@ -92,7 +99,7 @@ private native void setFillCache( * @return the reference to the current ReadOptions. */ public ReadOptions setSnapshot(final Snapshot snapshot) { - assert(isInitialized()); + assert(isOwningHandle()); if (snapshot != null) { setSnapshot(nativeHandle_, snapshot.nativeHandle_); } else { @@ -100,23 +107,30 @@ public ReadOptions setSnapshot(final Snapshot snapshot) { } return this; } - private native void setSnapshot(long handle, long snapshotHandle); /** - * Returns the currently assigned Snapshot instance. + * Returns the current read tier. * - * @return the Snapshot assigned to this instance. If no Snapshot - * is assigned null. + * @return the read tier in use, by default {@link ReadTier#READ_ALL_TIER} */ - public Snapshot snapshot() { - assert(isInitialized()); - long snapshotHandle = snapshot(nativeHandle_); - if (snapshotHandle != 0) { - return new Snapshot(snapshotHandle); - } - return null; + public ReadTier readTier() { + assert(isOwningHandle()); + return ReadTier.getReadTier(readTier(nativeHandle_)); + } + + /** + * Specify if this read request should process data that ALREADY + * resides on a particular cache. If the required data is not + * found at the specified cache, then {@link RocksDBException} is thrown. + * + * @param readTier {@link ReadTier} instance + * @return the reference to the current ReadOptions. + */ + public ReadOptions setReadTier(final ReadTier readTier) { + assert(isOwningHandle()); + setReadTier(nativeHandle_, readTier.getValue()); + return this; } - private native long snapshot(long handle); /** * Specify to create a tailing iterator -- a special iterator that has a @@ -130,10 +144,9 @@ public Snapshot snapshot() { * @return true if tailing iterator is enabled. 
*/ public boolean tailing() { - assert(isInitialized()); + assert(isOwningHandle()); return tailing(nativeHandle_); } - private native boolean tailing(long handle); /** * Specify to create a tailing iterator -- a special iterator that has a @@ -147,17 +160,135 @@ public boolean tailing() { * @return the reference to the current ReadOptions. */ public ReadOptions setTailing(final boolean tailing) { - assert(isInitialized()); + assert(isOwningHandle()); setTailing(nativeHandle_, tailing); return this; } - private native void setTailing( - long handle, boolean tailing); + /** + * Returns whether managed iterators will be used. + * + * @return the setting of whether managed iterators will be used, by default false + */ + public boolean managed() { + assert(isOwningHandle()); + return managed(nativeHandle_); + } + + /** + * Specify to create a managed iterator -- a special iterator that + * uses less resources by having the ability to free its underlying + * resources on request. + * + * @param managed if true, then managed iterators will be enabled. + * @return the reference to the current ReadOptions. + */ + public ReadOptions setManaged(final boolean managed) { + assert(isOwningHandle()); + setManaged(nativeHandle_, managed); + return this; + } + + /** + * Returns whether a total seek order will be used + * + * @return the setting of whether a total seek order will be used + */ + public boolean totalOrderSeek() { + assert(isOwningHandle()); + return totalOrderSeek(nativeHandle_); + } + + /** + * Enable a total order seek regardless of index format (e.g. hash index) + * used in the table. Some table format (e.g. plain table) may not support + * this option. + * + * @param totalOrderSeek if true, then total order seek will be enabled. + * @return the reference to the current ReadOptions. 
+ */ + public ReadOptions setTotalOrderSeek(final boolean totalOrderSeek) { + assert(isOwningHandle()); + setTotalOrderSeek(nativeHandle_, totalOrderSeek); + return this; + } + + /** + * Returns whether the iterator only iterates over the same prefix as the seek + * + * @return the setting of whether the iterator only iterates over the same + * prefix as the seek, default is false + */ + public boolean prefixSameAsStart() { + assert(isOwningHandle()); + return prefixSameAsStart(nativeHandle_); + } + + + /** + * Enforce that the iterator only iterates over the same prefix as the seek. + * This option is effective only for prefix seeks, i.e. prefix_extractor is + * non-null for the column family and {@link #totalOrderSeek()} is false. + * Unlike iterate_upper_bound, {@link #setPrefixSameAsStart(boolean)} only + * works within a prefix but in both directions. + * + * @param prefixSameAsStart if true, then the iterator only iterates over the + * same prefix as the seek + * @return the reference to the current ReadOptions. + */ + public ReadOptions setPrefixSameAsStart(final boolean prefixSameAsStart) { + assert(isOwningHandle()); + setPrefixSameAsStart(nativeHandle_, prefixSameAsStart); + return this; + } + + /** + * Returns whether the blocks loaded by the iterator will be pinned in memory + * + * @return the setting of whether the blocks loaded by the iterator will be + * pinned in memory + */ + public boolean pinData() { + assert(isOwningHandle()); + return pinData(nativeHandle_); + } - @Override protected void disposeInternal() { - disposeInternal(nativeHandle_); + /** + * Keep the blocks loaded by the iterator pinned in memory as long as the + * iterator is not deleted, If used when reading from tables created with + * BlockBasedTableOptions::use_delta_encoding = false, + * Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to + * return 1. 
+ * + * @param pinData if true, the blocks loaded by the iterator will be pinned + * @return the reference to the current ReadOptions. + */ + public ReadOptions setPinData(final boolean pinData) { + assert(isOwningHandle()); + setPinData(nativeHandle_, pinData); + return this; } - private native void disposeInternal(long handle); + + private native static long newReadOptions(); + private native boolean verifyChecksums(long handle); + private native void setVerifyChecksums(long handle, boolean verifyChecksums); + private native boolean fillCache(long handle); + private native void setFillCache(long handle, boolean fillCache); + private native long snapshot(long handle); + private native void setSnapshot(long handle, long snapshotHandle); + private native byte readTier(long handle); + private native void setReadTier(long handle, byte readTierValue); + private native boolean tailing(long handle); + private native void setTailing(long handle, boolean tailing); + private native boolean managed(long handle); + private native void setManaged(long handle, boolean managed); + private native boolean totalOrderSeek(long handle); + private native void setTotalOrderSeek(long handle, boolean totalOrderSeek); + private native boolean prefixSameAsStart(long handle); + private native void setPrefixSameAsStart(long handle, boolean prefixSameAsStart); + private native boolean pinData(long handle); + private native void setPinData(long handle, boolean pinData); + + @Override protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java b/external/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java new file mode 100644 index 0000000000..c6f48214d9 --- /dev/null +++ b/external/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * RocksDB {@link ReadOptions} read tiers. + */ +public enum ReadTier { + READ_ALL_TIER((byte)0), + BLOCK_CACHE_TIER((byte)1), + PERSISTED_TIER((byte)2); + + private final byte value; + + ReadTier(final byte value) { + this.value = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value; + } + + /** + * Get ReadTier by byte value. + * + * @param value byte representation of ReadTier. + * + * @return {@link org.rocksdb.ReadTier} instance or null. + * @throws java.lang.IllegalArgumentException if an invalid + * value is provided. + */ + public static ReadTier getReadTier(final byte value) { + for (final ReadTier readTier : ReadTier.values()) { + if (readTier.getValue() == value){ + return readTier; + } + } + throw new IllegalArgumentException("Illegal value provided for ReadTier."); + } +} diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java b/external/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java index 61c46131b8..1beb45c46f 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -8,11 +8,11 @@ /** * Just a Java wrapper around EmptyValueCompactionFilter implemented in C++ */ -public class RemoveEmptyValueCompactionFilter extends AbstractCompactionFilter { +public class RemoveEmptyValueCompactionFilter + extends AbstractCompactionFilter { public RemoveEmptyValueCompactionFilter() { - super(); - createNewRemoveEmptyValueCompactionFilter0(); + super(createNewRemoveEmptyValueCompactionFilter0()); } - private native void createNewRemoveEmptyValueCompactionFilter0(); + private native static long createNewRemoveEmptyValueCompactionFilter0(); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java b/external/rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java index 5a3b2fc9af..f303b15070 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -23,8 +23,7 @@ public class RestoreBackupableDB extends RocksObject { * @param options {@link org.rocksdb.BackupableDBOptions} instance */ public RestoreBackupableDB(final BackupableDBOptions options) { - super(); - nativeHandle_ = newRestoreBackupableDB(options.nativeHandle_); + super(newRestoreBackupableDB(options.nativeHandle_)); } /** @@ -52,7 +51,7 @@ public RestoreBackupableDB(final BackupableDBOptions options) { public void restoreDBFromBackup(final long backupId, final String dbDir, final String walDir, final RestoreOptions restoreOptions) throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); restoreDBFromBackup0(nativeHandle_, backupId, dbDir, walDir, restoreOptions.nativeHandle_); } @@ -70,7 +69,7 @@ public void restoreDBFromBackup(final long backupId, final String dbDir, public void restoreDBFromLatestBackup(final String dbDir, final String walDir, final RestoreOptions restoreOptions) throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); restoreDBFromLatestBackup0(nativeHandle_, dbDir, walDir, restoreOptions.nativeHandle_); } @@ -85,7 +84,7 @@ public void restoreDBFromLatestBackup(final String dbDir, */ public void purgeOldBackups(final int numBackupsToKeep) throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); purgeOldBackups0(nativeHandle_, numBackupsToKeep); } @@ -99,7 +98,7 @@ public void purgeOldBackups(final int numBackupsToKeep) */ public void deleteBackup(final int backupId) throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); deleteBackup0(nativeHandle_, backupId); } @@ -110,7 +109,7 @@ public void deleteBackup(final int backupId) * @return List of {@link BackupInfo} instances. */ public List getBackupInfos() { - assert(isInitialized()); + assert(isOwningHandle()); return getBackupInfo(nativeHandle_); } @@ -122,7 +121,7 @@ public List getBackupInfos() { * @return array of backup ids as int ids. 
*/ public int[] getCorruptedBackups() { - assert(isInitialized()); + assert(isOwningHandle()); return getCorruptedBackups(nativeHandle_); } @@ -135,19 +134,11 @@ public int[] getCorruptedBackups() { * native library. */ public void garbageCollect() throws RocksDBException { - assert(isInitialized()); + assert(isOwningHandle()); garbageCollect(nativeHandle_); } - /** - *

    Release the memory allocated for the current instance - * in the c++ side.

    - */ - @Override public synchronized void disposeInternal() { - dispose(nativeHandle_); - } - - private native long newRestoreBackupableDB(long options); + private native static long newRestoreBackupableDB(final long options); private native void restoreDBFromBackup0(long nativeHandle, long backupId, String dbDir, String walDir, long restoreOptions) throws RocksDBException; @@ -162,5 +153,6 @@ private native void deleteBackup0(long nativeHandle, int backupId) private native int[] getCorruptedBackups(long handle); private native void garbageCollect(long handle) throws RocksDBException; - private native void dispose(long nativeHandle); + @Override protected final native void disposeInternal( + final long nativeHandle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java index d98167aeb0..54b0eff28c 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,26 +16,17 @@ public class RestoreOptions extends RocksObject { /** * Constructor * - * @param keepLogFiles If true, restore won't overwrite the existing log files in wal_dir. It - * will also move all log files from archive directory to wal_dir. Use this - * option in combination with BackupableDBOptions::backup_log_files = false - * for persisting in-memory databases. - * Default: false + * @param keepLogFiles If true, restore won't overwrite the existing log files + * in wal_dir. 
It will also move all log files from archive directory to + * wal_dir. Use this option in combination with + * BackupableDBOptions::backup_log_files = false for persisting in-memory + * databases. + * Default: false */ public RestoreOptions(final boolean keepLogFiles) { - super(); - nativeHandle_ = newRestoreOptions(keepLogFiles); + super(newRestoreOptions(keepLogFiles)); } - /** - * Release the memory allocated for the current instance - * in the c++ side. - */ - @Override public synchronized void disposeInternal() { - assert(isInitialized()); - dispose(nativeHandle_); - } - - private native long newRestoreOptions(boolean keepLogFiles); - private native void dispose(long handle); + private native static long newRestoreOptions(boolean keepLogFiles); + @Override protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java index 2af55c4201..dd04ce3f5f 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -48,7 +48,8 @@ public static synchronized void loadLibrary() { } catch (IOException e) { - throw new RuntimeException("Unable to load the RocksDB shared library" + e); + throw new RuntimeException("Unable to load the RocksDB shared library" + + e); } } @@ -78,7 +79,8 @@ public static synchronized void loadLibrary(final List paths) { UnsatisfiedLinkError err = null; for (String path : paths) { try { - System.load(path + "/" + Environment.getJniLibraryFileName("rocksdbjni")); + System.load(path + "/" + + Environment.getJniLibraryFileName("rocksdbjni")); success = true; break; } catch (UnsatisfiedLinkError e) { @@ -116,8 +118,8 @@ public static RocksDB open(final String path) throws RocksDBException { * the path to the database using the specified options and db path and a list * of column family names. *

    - * If opened in read write mode every existing column family name must be passed - * within the list to this method.

    + * If opened in read write mode every existing column family name must be + * passed within the list to this method.

    *

    * If opened in read-only mode only a subset of existing column families must * be passed to this method.

    @@ -179,9 +181,7 @@ public static RocksDB open(final Options options, final String path) // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. - RocksDB db = new RocksDB(); - db.open(options.nativeHandle_, path); - + final RocksDB db = new RocksDB(open(options.nativeHandle_, path)); db.storeOptionsInstance(options); return db; } @@ -191,8 +191,8 @@ public static RocksDB open(final Options options, final String path) * the path to the database using the specified options and db path and a list * of column family names. *

    - * If opened in read write mode every existing column family name must be passed - * within the list to this method.

    + * If opened in read write mode every existing column family name must be + * passed within the list to this method.

    *

    * If opened in read-only mode only a subset of existing column families must * be passed to this method.

    @@ -206,7 +206,8 @@ public static RocksDB open(final Options options, final String path) * with new Options instance as underlying native statistics instance does not * use any locks to prevent concurrent updates.

    *

    - * ColumnFamily handles are disposed when the RocksDB instance is disposed.

    + * ColumnFamily handles are disposed when the RocksDB instance is disposed. + *

    * * @param options {@link org.rocksdb.DBOptions} instance. * @param path the path to the rocksdb. @@ -225,13 +226,25 @@ public static RocksDB open(final DBOptions options, final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { - RocksDB db = new RocksDB(); - List cfReferences = db.open(options.nativeHandle_, path, - columnFamilyDescriptors, columnFamilyDescriptors.size()); + + final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; + final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, cfReferences.get(i))); + final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors + .get(i); + cfNames[i] = cfDescriptor.columnFamilyName(); + cfOptionHandles[i] = cfDescriptor.columnFamilyOptions().nativeHandle_; } + + final long[] handles = open(options.nativeHandle_, path, cfNames, + cfOptionHandles); + final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); + + for (int i = 1; i < handles.length; i++) { + columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + } + return db; } @@ -276,7 +289,7 @@ public static RocksDB openReadOnly(final String path, throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. - DBOptions options = new DBOptions(); + final DBOptions options = new DBOptions(); return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles); } @@ -303,9 +316,7 @@ public static RocksDB openReadOnly(final Options options, final String path) // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. 
- RocksDB db = new RocksDB(); - db.openROnly(options.nativeHandle_, path); - + final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path)); db.storeOptionsInstance(options); return db; } @@ -339,14 +350,25 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. - RocksDB db = new RocksDB(); - List cfReferences = db.openROnly(options.nativeHandle_, path, - columnFamilyDescriptors, columnFamilyDescriptors.size()); - for (int i=0; i listColumnFamilies(final Options options, final String path) throws RocksDBException { - return RocksDB.listColumnFamilies(options.nativeHandle_, path); + return Arrays.asList(RocksDB.listColumnFamilies(options.nativeHandle_, + path)); } private void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } - @Override protected void disposeInternal() { - synchronized (this) { - assert (isInitialized()); - disposeInternal(nativeHandle_); - } - } - - /** - * Close the RocksDB instance. - * This function is equivalent to dispose(). - */ - public void close() { - dispose(); - } - /** * Set the database entry for "key" to "value". * @@ -393,7 +401,8 @@ public void close() { * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ - public void put(final byte[] key, final byte[] value) throws RocksDBException { + public void put(final byte[] key, final byte[] value) + throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length); } @@ -452,8 +461,8 @@ public void put(final WriteOptions writeOpts, final byte[] key, public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { - put(nativeHandle_, writeOpts.nativeHandle_, key, key.length, value, value.length, - columnFamilyHandle.nativeHandle_); + put(nativeHandle_, writeOpts.nativeHandle_, key, key.length, value, + value.length, columnFamilyHandle.nativeHandle_); } /** @@ -469,7 +478,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, * @return boolean value indicating if key does not exist or might exist. */ public boolean keyMayExist(final byte[] key, final StringBuffer value){ - return keyMayExist(key, key.length, value); + return keyMayExist(nativeHandle_, key, key.length, value); } /** @@ -487,8 +496,8 @@ public boolean keyMayExist(final byte[] key, final StringBuffer value){ */ public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final StringBuffer value){ - return keyMayExist(key, key.length, columnFamilyHandle.nativeHandle_, - value); + return keyMayExist(nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_, value); } /** @@ -506,7 +515,7 @@ public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, */ public boolean keyMayExist(final ReadOptions readOptions, final byte[] key, final StringBuffer value){ - return keyMayExist(readOptions.nativeHandle_, + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, key, key.length, value); } @@ -527,7 +536,7 @@ public boolean keyMayExist(final ReadOptions readOptions, public boolean keyMayExist(final ReadOptions readOptions, final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final 
StringBuffer value){ - return keyMayExist(readOptions.nativeHandle_, + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_, value); } @@ -543,7 +552,7 @@ public boolean keyMayExist(final ReadOptions readOptions, */ public void write(final WriteOptions writeOpts, final WriteBatch updates) throws RocksDBException { - write0(writeOpts.nativeHandle_, updates.nativeHandle_); + write0(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); } /** @@ -557,7 +566,7 @@ public void write(final WriteOptions writeOpts, final WriteBatch updates) */ public void write(final WriteOptions writeOpts, final WriteBatchWithIndex updates) throws RocksDBException { - write1(writeOpts.nativeHandle_, updates.nativeHandle_); + write1(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); } /** @@ -570,7 +579,8 @@ public void write(final WriteOptions writeOpts, * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void merge(final byte[] key, final byte[] value) throws RocksDBException { + public void merge(final byte[] key, final byte[] value) + throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length); } @@ -745,9 +755,10 @@ public byte[] get(final byte[] key) throws RocksDBException { * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ - public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) - throws RocksDBException { - return get(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { + return get(nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); } /** @@ -803,16 +814,16 @@ public Map multiGet(final List keys) throws RocksDBException { assert(keys.size() != 0); - List values = multiGet( - nativeHandle_, keys, keys.size()); + final byte[][] values = multiGet(nativeHandle_, + keys.toArray(new byte[keys.size()][])); Map keyValueMap = new HashMap<>(); - for(int i = 0; i < values.size(); i++) { - if(values.get(i) == null) { + for(int i = 0; i < values.length; i++) { + if(values[i] == null) { continue; } - keyValueMap.put(keys.get(i), values.get(i)); + keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; @@ -836,24 +847,30 @@ public Map multiGet(final List keys) * @throws IllegalArgumentException thrown if the size of passed keys is not * equal to the amount of passed column family handles. */ - public Map multiGet(final List columnFamilyHandleList, - final List keys) throws RocksDBException, IllegalArgumentException { + public Map multiGet( + final List columnFamilyHandleList, + final List keys) throws RocksDBException, + IllegalArgumentException { assert(keys.size() != 0); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. 
- if (keys.size()!=columnFamilyHandleList.size()) { + if (keys.size() != columnFamilyHandleList.size()) { throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } - List values = multiGet(nativeHandle_, keys, keys.size(), - columnFamilyHandleList); + final long[] cfHandles = new long[columnFamilyHandleList.size()]; + for (int i = 0; i < columnFamilyHandleList.size(); i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + final byte[][] values = multiGet(nativeHandle_, + keys.toArray(new byte[keys.size()][]), cfHandles); Map keyValueMap = new HashMap<>(); - for(int i = 0; i < values.size(); i++) { - if (values.get(i) == null) { + for(int i = 0; i < values.length; i++) { + if (values[i] == null) { continue; } - keyValueMap.put(keys.get(i), values.get(i)); + keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } @@ -873,16 +890,16 @@ public Map multiGet(final ReadOptions opt, final List keys) throws RocksDBException { assert(keys.size() != 0); - List values = multiGet( - nativeHandle_, opt.nativeHandle_, keys, keys.size()); + final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, + keys.toArray(new byte[keys.size()][])); Map keyValueMap = new HashMap<>(); - for(int i = 0; i < values.size(); i++) { - if(values.get(i) == null) { + for(int i = 0; i < values.length; i++) { + if(values[i] == null) { continue; } - keyValueMap.put(keys.get(i), values.get(i)); + keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; @@ -917,16 +934,19 @@ public Map multiGet(final ReadOptions opt, throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } - - List values = multiGet(nativeHandle_, opt.nativeHandle_, - keys, keys.size(), columnFamilyHandleList); + final long[] cfHandles = new long[columnFamilyHandleList.size()]; + for (int i = 0; i < columnFamilyHandleList.size(); i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + final byte[][] values = 
multiGet(nativeHandle_, opt.nativeHandle_, + keys.toArray(new byte[keys.size()][]), cfHandles); Map keyValueMap = new HashMap<>(); - for(int i = 0; i < values.size(); i++) { - if(values.get(i) == null) { + for(int i = 0; i < values.length; i++) { + if(values[i] == null) { continue; } - keyValueMap.put(keys.get(i), values.get(i)); + keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; @@ -958,8 +978,8 @@ public void remove(final byte[] key) throws RocksDBException { * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void remove(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) - throws RocksDBException { + public void remove(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { remove(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } @@ -1009,8 +1029,9 @@ public void remove(final ColumnFamilyHandle columnFamilyHandle, * *

    Valid property names include: *

      - *
    • "rocksdb.num-files-at-level<N>" - return the number of files at level <N>, - * where <N> is an ASCII representation of a level number (e.g. "0").
    • + *
    • "rocksdb.num-files-at-level<N>" - return the number of files at + * level <N>, where <N> is an ASCII representation of a level + * number (e.g. "0").
    • *
    • "rocksdb.stats" - returns a multi-line string that describes statistics * about the internal operation of the DB.
    • *
    • "rocksdb.sstables" - returns a multi-line string that describes all @@ -1027,8 +1048,8 @@ public void remove(final ColumnFamilyHandle columnFamilyHandle, */ public String getProperty(final ColumnFamilyHandle columnFamilyHandle, final String property) throws RocksDBException { - return getProperty0(nativeHandle_, columnFamilyHandle.nativeHandle_, property, - property.length()); + return getProperty0(nativeHandle_, columnFamilyHandle.nativeHandle_, + property, property.length()); } /** @@ -1039,8 +1060,9 @@ public String getProperty(final ColumnFamilyHandle columnFamilyHandle, * *

      Valid property names include: *

        - *
      • "rocksdb.num-files-at-level<N>" - return the number of files at level <N>, - * where <N> is an ASCII representation of a level number (e.g. "0").
      • + *
      • "rocksdb.num-files-at-level<N>" - return the number of files at + * level <N>, where <N> is an ASCII representation of a level + * number (e.g. "0").
      • *
      • "rocksdb.stats" - returns a multi-line string that describes statistics * about the internal operation of the DB.
      • *
      • "rocksdb.sstables" - returns a multi-line string that describes all @@ -1058,8 +1080,8 @@ public String getProperty(final String property) throws RocksDBException { } /** - *

        Similar to GetProperty(), but only works for a subset of properties whose - * return value is a numerical value. Return the value as long.

        + *

        Similar to GetProperty(), but only works for a subset of properties + * whose return value is a numerical value. Return the value as long.

        * *

        Note: As the returned property is of type * {@code uint64_t} on C++ side the returning value can be negative @@ -1084,8 +1106,8 @@ public long getLongProperty(final String property) throws RocksDBException { } /** - *

        Similar to GetProperty(), but only works for a subset of properties whose - * return value is a numerical value. Return the value as long.

        + *

        Similar to GetProperty(), but only works for a subset of properties + * whose return value is a numerical value. Return the value as long.

        * *

        Note: As the returned property is of type * {@code uint64_t} on C++ side the returning value can be negative @@ -1109,8 +1131,8 @@ public long getLongProperty(final String property) throws RocksDBException { */ public long getLongProperty(final ColumnFamilyHandle columnFamilyHandle, final String property) throws RocksDBException { - return getLongProperty(nativeHandle_, columnFamilyHandle.nativeHandle_, property, - property.length()); + return getLongProperty(nativeHandle_, columnFamilyHandle.nativeHandle_, + property, property.length()); } /** @@ -1192,7 +1214,8 @@ public void releaseSnapshot(final Snapshot snapshot) { * instance * @return instance of iterator object. */ - public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle) { + public RocksIterator newIterator( + final ColumnFamilyHandle columnFamilyHandle) { return new RocksIterator(this, iteratorCF(nativeHandle_, columnFamilyHandle.nativeHandle_)); } @@ -1232,7 +1255,8 @@ public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle, * native library. 
*/ public List newIterators( - final List columnFamilyHandleList) throws RocksDBException { + final List columnFamilyHandleList) + throws RocksDBException { return newIterators(columnFamilyHandleList, new ReadOptions()); } @@ -1253,11 +1277,17 @@ public List newIterators( public List newIterators( final List columnFamilyHandleList, final ReadOptions readOptions) throws RocksDBException { - List iterators = - new ArrayList<>(columnFamilyHandleList.size()); - long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList, + final long[] columnFamilyHandles = new long[columnFamilyHandleList.size()]; + for (int i = 0; i < columnFamilyHandleList.size(); i++) { + columnFamilyHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + + final long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandles, readOptions.nativeHandle_); + + final List iterators = new ArrayList<>( + columnFamilyHandleList.size()); for (int i=0; iThe sequence number of the most recent transaction.

        * @@ -1672,26 +1720,55 @@ public TransactionLogIterator getUpdatesSince(final long sequenceNumber) /** * Private constructor. + * + * @param nativeHandle The native handle of the C++ RocksDB object */ - protected RocksDB() { - super(); + protected RocksDB(final long nativeHandle) { + super(nativeHandle); } // native methods - protected native void open( - long optionsHandle, String path) throws RocksDBException; - protected native List open(long optionsHandle, String path, - List columnFamilyDescriptors, - int columnFamilyDescriptorsLength) - throws RocksDBException; - protected native static List listColumnFamilies( - long optionsHandle, String path) throws RocksDBException; - protected native void openROnly( + protected native static long open(final long optionsHandle, + final String path) throws RocksDBException; + + /** + * @param optionsHandle Native handle pointing to an Options object + * @param path The directory path for the database files + * @param columnFamilyNames An array of column family names + * @param columnFamilyOptions An array of native handles pointing to + * ColumnFamilyOptions objects + * + * @return An array of native handles, [0] is the handle of the RocksDB object + * [1..1+n] are handles of the ColumnFamilyReferences + * + * @throws RocksDBException thrown if the database could not be opened + */ + protected native static long[] open(final long optionsHandle, + final String path, final byte[][] columnFamilyNames, + final long[] columnFamilyOptions) throws RocksDBException; + + protected native static long openROnly(final long optionsHandle, + final String path) throws RocksDBException; + + /** + * @param optionsHandle Native handle pointing to an Options object + * @param path The directory path for the database files + * @param columnFamilyNames An array of column family names + * @param columnFamilyOptions An array of native handles pointing to + * ColumnFamilyOptions objects + * + * @return An array of native handles, [0] is 
the handle of the RocksDB object + * [1..1+n] are handles of the ColumnFamilyReferences + * + * @throws RocksDBException thrown if the database could not be opened + */ + protected native static long[] openROnly(final long optionsHandle, + final String path, final byte[][] columnFamilyNames, + final long[] columnFamilyOptions + ) throws RocksDBException; + + protected native static byte[][] listColumnFamilies( long optionsHandle, String path) throws RocksDBException; - protected native List openROnly( - long optionsHandle, String path, - List columnFamilyDescriptors, - int columnFamilyDescriptorsLength) throws RocksDBException; protected native void put( long handle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; @@ -1706,18 +1783,20 @@ protected native void put( long handle, long writeOptHandle, byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle) throws RocksDBException; - protected native void write0( - long writeOptHandle, long wbHandle) throws RocksDBException; - protected native void write1( - long writeOptHandle, long wbwiHandle) throws RocksDBException; - protected native boolean keyMayExist(byte[] key, int keyLen, - StringBuffer stringBuffer); - protected native boolean keyMayExist(byte[] key, int keyLen, - long cfHandle, StringBuffer stringBuffer); - protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, - StringBuffer stringBuffer); - protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, - long cfHandle, StringBuffer stringBuffer); + protected native void write0(final long handle, long writeOptHandle, + long wbHandle) throws RocksDBException; + protected native void write1(final long handle, long writeOptHandle, + long wbwiHandle) throws RocksDBException; + protected native boolean keyMayExist(final long handle, final byte[] key, + final int keyLen, final StringBuffer stringBuffer); + protected native boolean keyMayExist(final long handle, final byte[] key, + 
final int keyLen, final long cfHandle, final StringBuffer stringBuffer); + protected native boolean keyMayExist(final long handle, + final long optionsHandle, final byte[] key, final int keyLen, + final StringBuffer stringBuffer); + protected native boolean keyMayExist(final long handle, + final long optionsHandle, final byte[] key, final int keyLen, + final long cfHandle, final StringBuffer stringBuffer); protected native void merge( long handle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; @@ -1744,20 +1823,18 @@ protected native int get( protected native int get( long handle, long readOptHandle, byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle) throws RocksDBException; - protected native List multiGet( - long dbHandle, List keys, int keysCount); - protected native List multiGet( - long dbHandle, List keys, int keysCount, List - cfHandles); - protected native List multiGet( - long dbHandle, long rOptHandle, List keys, int keysCount); - protected native List multiGet( - long dbHandle, long rOptHandle, List keys, int keysCount, - List cfHandles); + protected native byte[][] multiGet(final long dbHandle, final byte[][] keys); + protected native byte[][] multiGet(final long dbHandle, final byte[][] keys, + final long[] columnFamilyHandles); + protected native byte[][] multiGet(final long dbHandle, final long rOptHandle, + final byte[][] keys); + protected native byte[][] multiGet(final long dbHandle, final long rOptHandle, + final byte[][] keys, final long[] columnFamilyHandles); protected native byte[] get( long handle, byte[] key, int keyLen) throws RocksDBException; protected native byte[] get( - long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException; + long handle, byte[] key, int keyLen, long cfHandle) + throws RocksDBException; protected native byte[] get( long handle, long readOptHandle, byte[] key, int keyLen) throws RocksDBException; @@ -1767,7 +1844,8 @@ protected native byte[] get( 
protected native void remove( long handle, byte[] key, int keyLen) throws RocksDBException; protected native void remove( - long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException; + long handle, byte[] key, int keyLen, long cfHandle) + throws RocksDBException; protected native void remove( long handle, long writeOptHandle, byte[] key, int keyLen) throws RocksDBException; @@ -1787,34 +1865,38 @@ protected native long getLongProperty(long nativeHandle, long cfHandle, protected native long iteratorCF(long handle, long cfHandle); protected native long iteratorCF(long handle, long cfHandle, long readOptHandle); - protected native long[] iterators(long handle, - List columnFamilyNames, long readOptHandle) + protected native long[] iterators(final long handle, + final long[] columnFamilyHandles, final long readOptHandle) throws RocksDBException; protected native long getSnapshot(long nativeHandle); protected native void releaseSnapshot( long nativeHandle, long snapshotHandle); - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); private native long getDefaultColumnFamily(long handle); - private native long createColumnFamily(long handle, - ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException; - private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException; + private native long createColumnFamily(final long handle, + final byte[] columnFamilyName, final long columnFamilyOptions) + throws RocksDBException; + private native void dropColumnFamily(long handle, long cfHandle) + throws RocksDBException; private native void flush(long handle, long flushOptHandle) throws RocksDBException; private native void flush(long handle, long flushOptHandle, long cfHandle) throws RocksDBException; - private native void compactRange0(long handle, boolean reduce_level, int target_level, + private native void compactRange0(long handle, boolean 
reduce_level, + int target_level, int target_path_id) throws RocksDBException; + private native void compactRange0(long handle, byte[] begin, int beginLen, + byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id) throws RocksDBException; - private native void compactRange0(long handle, byte[] begin, int beginLen, byte[] end, - int endLen, boolean reduce_level, int target_level, int target_path_id) + private native void compactRange(long handle, boolean reduce_level, + int target_level, int target_path_id, long cfHandle) throws RocksDBException; - private native void compactRange(long handle, boolean reduce_level, int target_level, + private native void compactRange(long handle, byte[] begin, int beginLen, + byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id, long cfHandle) throws RocksDBException; - private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end, - int endLen, boolean reduce_level, int target_level, int target_path_id, - long cfHandle) throws RocksDBException; + private native void pauseBackgroundWork(long handle) throws RocksDBException; + private native void continueBackgroundWork(long handle) throws RocksDBException; private native long getLatestSequenceNumber(long handle); - private native void disableFileDeletions(long handle) - throws RocksDBException; + private native void disableFileDeletions(long handle) throws RocksDBException; private native void enableFileDeletions(long handle, boolean force) throws RocksDBException; private native long getUpdatesSince(long handle, long sequenceNumber) diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java index a65d40124a..ee869f20ff 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java @@ -1,4 +1,4 @@ -// Copyright (c) 
2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java index 4c399eafa0..72dc22c42c 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -24,8 +24,7 @@ public class RocksEnv extends Env { * {@code dispose()} of the created RocksEnv will be no-op.

        */ RocksEnv(final long handle) { - super(); - nativeHandle_ = handle; + super(handle); disOwnNativeHandle(); } @@ -38,6 +37,7 @@ public class RocksEnv extends Env { * RocksEnv with RocksJava. The default env allocation is managed * by C++.

        */ - @Override protected void disposeInternal() { + @Override + protected final void disposeInternal(final long handle) { } } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java index bb9a6e697f..42e2460cf1 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -33,7 +33,7 @@ protected RocksIterator(RocksDB rocksDB, long nativeHandle) { * @return key for the current entry. */ public byte[] key() { - assert(isInitialized()); + assert(isOwningHandle()); return key0(nativeHandle_); } @@ -46,11 +46,11 @@ public byte[] key() { * @return value for the current entry. */ public byte[] value() { - assert(isInitialized()); + assert(isOwningHandle()); return value0(nativeHandle_); } - @Override final native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); @Override final native boolean isValid0(long handle); @Override final native void seekToFirst0(long handle); @Override final native void seekToLast0(long handle); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java index fce8fe314a..3ac74a90a2 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java index 54c9f99818..d7854eae17 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,15 +19,9 @@ public class RocksMemEnv extends Env { *

        {@code *base_env} must remain live while the result is in use.

        */ public RocksMemEnv() { - super(); - nativeHandle_ = createMemEnv(); - } - - @Override - protected void disposeInternal() { - disposeInternal(nativeHandle_); + super(createMemEnv()); } private static native long createMemEnv(); - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksMutableObject.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksMutableObject.java new file mode 100644 index 0000000000..9b9e576dce --- /dev/null +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksMutableObject.java @@ -0,0 +1,69 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * RocksMutableObject is an implementation of {@link AbstractNativeReference} + * whose reference to the underlying native C++ object can change. + * + *

        The use of {@code RocksMutableObject} should be kept to a minimum, as it + * has synchronization overheads and introduces complexity. Instead it is + * recommended to use {@link RocksObject} where possible.

        + */ +public abstract class RocksMutableObject extends AbstractNativeReference { + + /** + * An mutable reference to the value of the C++ pointer pointing to some + * underlying native RocksDB C++ object. + */ + private long nativeHandle_; + private boolean owningHandle_; + + protected RocksMutableObject() { + } + + protected RocksMutableObject(final long nativeHandle) { + this.nativeHandle_ = nativeHandle; + this.owningHandle_ = true; + } + + public synchronized void setNativeHandle(final long nativeHandle, + final boolean owningNativeHandle) { + this.nativeHandle_ = nativeHandle; + this.owningHandle_ = owningNativeHandle; + } + + @Override + protected synchronized boolean isOwningHandle() { + return this.owningHandle_; + } + + /** + * Gets the value of the C++ pointer pointing to the underlying + * native C++ object + * + * @return the pointer value for the native object + */ + protected synchronized long getNativeHandle() { + assert (this.nativeHandle_ != 0); + return this.nativeHandle_; + } + + @Override + public synchronized final void close() { + if (isOwningHandle()) { + disposeInternal(); + this.owningHandle_ = false; + this.nativeHandle_ = 0; + } + } + + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + protected abstract void disposeInternal(final long handle); +} diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java b/external/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java index 6e24a13852..2a35852c5e 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,120 +6,36 @@ package org.rocksdb; /** - * RocksObject is the base-class of all RocksDB classes that has a pointer to - * some c++ {@code rocksdb} object. - * + * RocksObject is an implementation of {@link AbstractNativeReference} which + * has an immutable and therefore thread-safe reference to the underlying + * native C++ RocksDB object. *

        - * RocksObject has {@code dispose()} function, which releases its associated c++ - * resource.

        + * RocksObject is the base-class of almost all RocksDB classes that have a + * pointer to some underlying native C++ {@code rocksdb} object.

        *

        - * This function can be either called manually, or being called automatically - * during the regular Java GC process. However, since Java may wrongly assume a - * RocksObject only contains a long member variable and think it is small in size, - * Java may give {@code RocksObject} low priority in the GC process. For this, it is - * suggested to call {@code dispose()} manually. However, it is safe to let - * {@code RocksObject} go out-of-scope without manually calling {@code dispose()} - * as {@code dispose()} will be called in the finalizer during the - * regular GC process.

        + * The use of {@code RocksObject} should always be preferred over + * {@link RocksMutableObject}.

        */ -public abstract class RocksObject { - protected RocksObject() { - nativeHandle_ = 0; - owningHandle_ = true; - } - - /** - * Release the c++ object manually pointed by the native handle. - *

        - * Note that {@code dispose()} will also be called during the GC process - * if it was not called before its {@code RocksObject} went out-of-scope. - * However, since Java may wrongly wrongly assume those objects are - * small in that they seems to only hold a long variable. As a result, - * they might have low priority in the GC process. To prevent this, - * it is suggested to call {@code dispose()} manually. - *

        - *

        - * Note that once an instance of {@code RocksObject} has been disposed, - * calling its function will lead undefined behavior. - *

        - */ - public final synchronized void dispose() { - if (isOwningNativeHandle() && isInitialized()) { - disposeInternal(); - } - nativeHandle_ = 0; - disOwnNativeHandle(); - } - - /** - * The helper function of {@code dispose()} which all subclasses of - * {@code RocksObject} must implement to release their associated - * C++ resource. - */ - protected abstract void disposeInternal(); - - /** - * Revoke ownership of the native object. - *

        - * This will prevent the object from attempting to delete the underlying - * native object in its finalizer. This must be used when another object - * takes over ownership of the native object or both will attempt to delete - * the underlying object when garbage collected. - *

        - * When {@code disOwnNativeHandle()} is called, {@code dispose()} will simply set - * {@code nativeHandle_} to 0 without releasing its associated C++ resource. - * As a result, incorrectly use this function may cause memory leak, and this - * function call will not affect the return value of {@code isInitialized()}. - *

        - * @see #dispose() - * @see #isInitialized() - */ - protected void disOwnNativeHandle() { - owningHandle_ = false; - } +public abstract class RocksObject extends AbstractImmutableNativeReference { /** - * Returns true if the current {@code RocksObject} is responsible to release - * its native handle. - * - * @return true if the current {@code RocksObject} is responsible to release - * its native handle. - * - * @see #disOwnNativeHandle() - * @see #dispose() + * An immutable reference to the value of the C++ pointer pointing to some + * underlying native RocksDB C++ object. */ - protected boolean isOwningNativeHandle() { - return owningHandle_; - } + protected final long nativeHandle_; - /** - * Returns true if the associated native handle has been initialized. - * - * @return true if the associated native handle has been initialized. - * - * @see #dispose() - */ - protected boolean isInitialized() { - return (nativeHandle_ != 0); + protected RocksObject(final long nativeHandle) { + super(true); + this.nativeHandle_ = nativeHandle; } /** - * Simply calls {@code dispose()} and release its c++ resource if it has not - * yet released. + * Deletes underlying C++ object pointer. */ - @Override protected void finalize() throws Throwable { - dispose(); - super.finalize(); + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); } - /** - * A long variable holding c++ pointer pointing to some RocksDB C++ object. - */ - protected long nativeHandle_; - - /** - * A flag indicating whether the current {@code RocksObject} is responsible to - * release the c++ object stored in its {@code nativeHandle_}. 
- */ - private boolean owningHandle_; + protected abstract void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Slice.java b/external/rocksdb/java/src/main/java/org/rocksdb/Slice.java index d26490e5f8..cbb9742a37 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Slice.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Slice.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -29,7 +29,6 @@ public class Slice extends AbstractSlice { */ private Slice() { super(); - disOwnNativeHandle(); } /** @@ -39,8 +38,7 @@ private Slice() { * @param str String value. */ public Slice(final String str) { - super(); - createNewSliceFromString(str); + super(createNewSliceFromString(str)); } /** @@ -51,8 +49,7 @@ public Slice(final String str) { * @param offset offset within the byte array. */ public Slice(final byte[] data, final int offset) { - super(); - createNewSlice0(data, offset); + super(createNewSlice0(data, offset)); } /** @@ -62,8 +59,7 @@ public Slice(final byte[] data, final int offset) { * @param data byte array. 
*/ public Slice(final byte[] data) { - super(); - createNewSlice1(data); + super(createNewSlice1(data)); } /** @@ -77,12 +73,14 @@ public Slice(final byte[] data) { */ @Override protected void disposeInternal() { - disposeInternalBuf(nativeHandle_); - super.disposeInternal(); + final long nativeHandle = getNativeHandle(); + disposeInternalBuf(nativeHandle); + super.disposeInternal(nativeHandle); } @Override protected final native byte[] data0(long handle); - private native void createNewSlice0(byte[] data, int length); - private native void createNewSlice1(byte[] data); - private native void disposeInternalBuf(long handle); + private native static long createNewSlice0(final byte[] data, + final int length); + private native static long createNewSlice1(final byte[] data); + private native void disposeInternalBuf(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java b/external/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java index 7ef5c383df..8475ec9951 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,8 +10,7 @@ */ public class Snapshot extends RocksObject { Snapshot(final long nativeHandle) { - super(); - nativeHandle_ = nativeHandle; + super(nativeHandle); } /** @@ -21,7 +20,7 @@ public class Snapshot extends RocksObject { * this snapshot. 
*/ public long getSequenceNumber() { - assert(isInitialized()); + assert(isOwningHandle()); return getSequenceNumber(nativeHandle_); } @@ -30,7 +29,8 @@ public long getSequenceNumber() { * to the snapshot is released by the database * instance. */ - @Override protected void disposeInternal() { + @Override + protected final void disposeInternal(final long handle) { } private native long getSequenceNumber(long handle); diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/Statistics.java b/external/rocksdb/java/src/main/java/org/rocksdb/Statistics.java index a099444f48..858637763a 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/Statistics.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/Statistics.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -22,9 +22,9 @@ public long getTickerCount(TickerType tickerType) { return getTickerCount0(tickerType.getValue(), statsHandle_); } - public HistogramData geHistogramData(final HistogramType histogramType) { + public HistogramData getHistogramData(final HistogramType histogramType) { assert(isInitialized()); - return geHistogramData0( + return getHistogramData0( histogramType.getValue(), statsHandle_); } @@ -33,5 +33,5 @@ private boolean isInitialized() { } private native long getTickerCount0(int tickerType, long handle); - private native HistogramData geHistogramData0(int histogramType, long handle); + private native HistogramData getHistogramData0(int histogramType, long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java b/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java index 4f1577ca71..246826d326 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -86,7 +86,7 @@ public void run() { // Collect histogram data for(HistogramType histogramType : HistogramType.values()) { HistogramData histogramData = - statistics.geHistogramData(histogramType); + statistics.getHistogramData(histogramType); statsCallback.histogramCallback(histogramType, histogramData); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java b/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java index 2ce92c5ee1..18f81790e5 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java b/external/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java index 0e842c256d..a3acede3fc 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java b/external/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java index 58a533b22a..29cd262c2c 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/TickerType.java b/external/rocksdb/java/src/main/java/org/rocksdb/TickerType.java index 180fbf4a66..9ff819a202 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/TickerType.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/TickerType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java b/external/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java index 36f7e2cdf4..f9684bd72e 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java @@ -57,12 +57,7 @@ public BatchResult getBatch() { * @param nativeHandle address to native address. 
*/ TransactionLogIterator(final long nativeHandle) { - super(); - nativeHandle_ = nativeHandle; - } - - @Override protected void disposeInternal() { - disposeInternal(nativeHandle_); + super(nativeHandle); } /** @@ -107,7 +102,7 @@ public WriteBatch writeBatch() { private final WriteBatch writeBatch_; } - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); private native boolean isValid(long handle); private native void next(long handle); private native void status(long handle) diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java b/external/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java index de6dea9a50..72704893c2 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -84,9 +84,7 @@ public static TtlDB open(final Options options, final String db_path) */ public static TtlDB open(final Options options, final String db_path, final int ttl, final boolean readOnly) throws RocksDBException { - TtlDB ttldb = new TtlDB(); - ttldb.open(options.nativeHandle_, db_path, ttl, readOnly); - return ttldb; + return new TtlDB(open(options.nativeHandle_, db_path, ttl, readOnly)); } /** @@ -114,15 +112,29 @@ public static TtlDB open(final DBOptions options, final String db_path, final List ttlValues, final boolean readOnly) throws RocksDBException { if (columnFamilyDescriptors.size() != ttlValues.size()) { - throw new IllegalArgumentException("There must be a ttl value per column" + - "family handle."); + throw new IllegalArgumentException("There must be a ttl value per column" + + "family handle."); } - TtlDB ttlDB = new TtlDB(); - List cfReferences = ttlDB.openCF(options.nativeHandle_, db_path, - columnFamilyDescriptors, columnFamilyDescriptors.size(), - ttlValues, readOnly); - for (int i=0; i */ - @Override public synchronized void close() { - if (isInitialized()) { + @Override + public void close() { super.close(); - } } /** @@ -175,23 +186,26 @@ public ColumnFamilyHandle createColumnFamilyWithTtl( * {@link #open(DBOptions, String, java.util.List, java.util.List, * java.util.List, boolean)}. *

        + * + * @param nativeHandle The native handle of the C++ TtlDB object */ - protected TtlDB() { - super(); + protected TtlDB(final long nativeHandle) { + super(nativeHandle); } @Override protected void finalize() throws Throwable { - close(); + close(); //TODO(AR) revisit here when implementing AutoCloseable super.finalize(); } - private native void open(long optionsHandle, String db_path, int ttl, - boolean readOnly) throws RocksDBException; - private native List openCF(long optionsHandle, String db_path, - List columnFamilyDescriptors, - int columnFamilyDescriptorsLength, List ttlValues, - boolean readOnly) throws RocksDBException; - private native long createColumnFamilyWithTtl(long handle, - ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) + private native static long open(final long optionsHandle, + final String db_path, final int ttl, final boolean readOnly) + throws RocksDBException; + private native static long[] openCF(final long optionsHandle, + final String db_path, final byte[][] columnFamilyNames, + final long[] columnFamilyOptions, final int[] ttlValues, + final boolean readOnly) throws RocksDBException; + private native long createColumnFamilyWithTtl(final long handle, + final byte[] columnFamilyName, final long columnFamilyOptions, int ttl) throws RocksDBException; } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/external/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java index f42f5498bb..7eb019d0b2 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java @@ -1,14 +1,16 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. package org.rocksdb; -public class WBWIRocksIterator extends AbstractRocksIterator { +public class WBWIRocksIterator + extends AbstractRocksIterator { private final WriteEntry entry = new WriteEntry(); - protected WBWIRocksIterator(final WriteBatchWithIndex wbwi, final long nativeHandle) { + protected WBWIRocksIterator(final WriteBatchWithIndex wbwi, + final long nativeHandle) { super(wbwi, nativeHandle); } @@ -20,16 +22,24 @@ protected WBWIRocksIterator(final WriteBatchWithIndex wbwi, final long nativeHan * If you want to keep the WriteEntry across iterator * movements, you must make a copy of its data! * + * Note - This method is not thread-safe with respect to the WriteEntry + * as it performs a non-atomic update across the fields of the WriteEntry + * * @return The WriteEntry of the current entry */ public WriteEntry entry() { - assert(isInitialized()); + assert(isOwningHandle()); assert(entry != null); - entry1(nativeHandle_, entry); + final long ptrs[] = entry1(nativeHandle_); + + entry.type = WriteType.fromId((byte)ptrs[0]); + entry.key.setNativeHandle(ptrs[1], true); + entry.value.setNativeHandle(ptrs[2], ptrs[2] != 0); + return entry; } - @Override final native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); @Override final native boolean isValid0(long handle); @Override final native void seekToFirst0(long handle); @Override final native void seekToLast0(long handle); @@ -38,17 +48,31 @@ public WriteEntry entry() { @Override final native void seek0(long handle, byte[] target, int targetLen); @Override final native void status0(long handle) throws RocksDBException; - private native void entry1(long handle, WriteEntry entry); + private native long[] entry1(final long handle); /** * Enumeration of the Write operation * that created the record in the Write Batch */ public enum WriteType { - PUT, - MERGE, 
- DELETE, - LOG + PUT((byte)0x1), + MERGE((byte)0x2), + DELETE((byte)0x4), + LOG((byte)0x8); + + final byte id; + WriteType(final byte id) { + this.id = id; + } + + public static WriteType fromId(final byte id) { + for(final WriteType wt : WriteType.values()) { + if(id == wt.id) { + return wt; + } + } + throw new IllegalArgumentException("No WriteType with id=" + id); + } } /** @@ -110,7 +134,7 @@ public DirectSlice getKey() { * no value */ public DirectSlice getValue() { - if(!value.isInitialized()) { + if(!value.isOwningHandle()) { return null; //TODO(AR) migrate to JDK8 java.util.Optional#empty() } else { return value; @@ -139,8 +163,7 @@ public boolean equals(Object other) { final WriteEntry otherWriteEntry = (WriteEntry)other; return type.equals(otherWriteEntry.type) && key.equals(otherWriteEntry.key) - && (value.isInitialized() ? value.equals(otherWriteEntry.value) - : !otherWriteEntry.value.isInitialized()); + && value.equals(otherWriteEntry.value); } else { return false; } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java b/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java index 960d122e2d..325f9c0573 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -27,8 +27,7 @@ public class WriteBatch extends AbstractWriteBatch { * Constructs a WriteBatch instance. 
*/ public WriteBatch() { - super(); - newWriteBatch(0); + this(0); } /** @@ -37,8 +36,7 @@ public WriteBatch() { * @param reserved_bytes reserved size for WriteBatch */ public WriteBatch(final int reserved_bytes) { - nativeHandle_ = 0; - newWriteBatch(reserved_bytes); + super(newWriteBatch(reserved_bytes)); } /** @@ -50,7 +48,7 @@ public WriteBatch(final int reserved_bytes) { * @throws RocksDBException If we cannot iterate over the batch */ public void iterate(final Handler handler) throws RocksDBException { - iterate(handler.nativeHandle_); + iterate(nativeHandle_, handler.nativeHandle_); } /** @@ -61,35 +59,46 @@ public void iterate(final Handler handler) throws RocksDBException { * @param nativeHandle address of native instance. */ WriteBatch(final long nativeHandle) { - super(); + super(nativeHandle); disOwnNativeHandle(); - nativeHandle_ = nativeHandle; } - @Override final native void disposeInternal(long handle); - @Override final native int count0(); - @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen); - @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen, - long cfHandle); - @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen); - @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen, - long cfHandle); - @Override final native void remove(byte[] key, int keyLen); - @Override final native void remove(byte[] key, int keyLen, long cfHandle); - @Override final native void putLogData(byte[] blob, int blobLen); - @Override final native void clear0(); + @Override protected final native void disposeInternal(final long handle); + @Override final native int count0(final long handle); + @Override final native void put(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen); + @Override final native void put(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen, 
+ final long cfHandle); + @Override final native void merge(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen); + @Override final native void merge(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen, + final long cfHandle); + @Override final native void remove(final long handle, final byte[] key, + final int keyLen); + @Override final native void remove(final long handle, final byte[] key, + final int keyLen, final long cfHandle); + @Override final native void putLogData(final long handle, + final byte[] blob, final int blobLen); + @Override final native void clear0(final long handle); + @Override final native void setSavePoint0(final long handle); + @Override final native void rollbackToSavePoint0(final long handle); - private native void newWriteBatch(int reserved_bytes); - private native void iterate(long handlerHandle) throws RocksDBException; + private native static long newWriteBatch(final int reserved_bytes); + private native void iterate(final long handle, final long handlerHandle) + throws RocksDBException; /** * Handler callback for iterating over the contents of a batch. 
*/ - public static abstract class Handler extends RocksObject { + public static abstract class Handler + extends AbstractImmutableNativeReference { + private final long nativeHandle_; public Handler() { - super(); - createNewHandler0(); + super(true); + this.nativeHandle_ = createNewHandler0(); } public abstract void put(byte[] key, byte[] value); @@ -116,11 +125,10 @@ public boolean shouldContinue() { */ @Override protected void disposeInternal() { - assert(isInitialized()); disposeInternal(nativeHandle_); } - private native void createNewHandler0(); - private native void disposeInternal(long handle); + private native long createNewHandler0(); + private native void disposeInternal(final long handle); } } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java index d5c24ec3aa..a077918519 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -95,4 +95,19 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * Clear all updates buffered in this batch */ void clear(); + + /** + * Records the state of the batch for future calls to RollbackToSavePoint(). + * May be called multiple times to set multiple save points. + */ + void setSavePoint(); + + /** + * Remove all entries in this batch (Put, Merge, Delete, PutLogData) since + * the most recent call to SetSavePoint() and removes the most recent save + * point. 
+ * + * @throws RocksDBException if there is no previous call to SetSavePoint() + */ + void rollbackToSavePoint() throws RocksDBException; } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java index bde037bc38..dad908a246 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,10 +12,10 @@ * Calling put, merge, remove or putLogData calls the same function * as with {@link org.rocksdb.WriteBatch} whilst also building an index. 
* - * A user can call {@link org.rocksdb.WriteBatchWithIndex#newIterator() }to create an iterator - * over the write batch or - * {@link org.rocksdb.WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator)} to - * get an iterator for the database with Read-Your-Own-Writes like capability + * A user can call {@link org.rocksdb.WriteBatchWithIndex#newIterator()} to + * create an iterator over the write batch or + * {@link org.rocksdb.WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator)} + * to get an iterator for the database with Read-Your-Own-Writes like capability */ public class WriteBatchWithIndex extends AbstractWriteBatch { /** @@ -25,8 +25,7 @@ public class WriteBatchWithIndex extends AbstractWriteBatch { * and duplicate keys operations are retained */ public WriteBatchWithIndex() { - super(); - newWriteBatchWithIndex(); + super(newWriteBatchWithIndex()); } @@ -41,8 +40,7 @@ public WriteBatchWithIndex() { * show two entries with the same key. */ public WriteBatchWithIndex(final boolean overwriteKey) { - super(); - newWriteBatchWithIndex(overwriteKey); + super(newWriteBatchWithIndex(overwriteKey)); } /** @@ -58,10 +56,12 @@ public WriteBatchWithIndex(final boolean overwriteKey) { * inserting a duplicate key, in this way an iterator will never * show two entries with the same key. */ - public WriteBatchWithIndex(final AbstractComparator> - fallbackIndexComparator, final int reservedBytes, final boolean overwriteKey) { - super(); - newWriteBatchWithIndex(fallbackIndexComparator.nativeHandle_, reservedBytes, overwriteKey); + public WriteBatchWithIndex( + final AbstractComparator> + fallbackIndexComparator, final int reservedBytes, + final boolean overwriteKey) { + super(newWriteBatchWithIndex(fallbackIndexComparator.getNativeHandle(), + reservedBytes, overwriteKey)); } /** @@ -73,10 +73,13 @@ public WriteBatchWithIndex(final AbstractComparator> * time. 
* * @param columnFamilyHandle The column family to iterate over - * @return An iterator for the Write Batch contents, restricted to the column family + * @return An iterator for the Write Batch contents, restricted to the column + * family */ - public WBWIRocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle) { - return new WBWIRocksIterator(this, iterator1(columnFamilyHandle.nativeHandle_)); + public WBWIRocksIterator newIterator( + final ColumnFamilyHandle columnFamilyHandle) { + return new WBWIRocksIterator(this, iterator1(nativeHandle_, + columnFamilyHandle.nativeHandle_)); } /** @@ -90,7 +93,7 @@ public WBWIRocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle * @return An iterator for the Write Batch contents */ public WBWIRocksIterator newIterator() { - return new WBWIRocksIterator(this, iterator0()); + return new WBWIRocksIterator(this, iterator0(nativeHandle_)); } /** @@ -99,15 +102,19 @@ public WBWIRocksIterator newIterator() { * as a delta and baseIterator as a base * * @param columnFamilyHandle The column family to iterate over - * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} - * @return An iterator which shows a view comprised of both the database point-in-time - * from baseIterator and modifications made in this write batch. + * @param baseIterator The base iterator, + * e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @return An iterator which shows a view comprised of both the database + * point-in-time from baseIterator and modifications made in this write batch. 
*/ - public RocksIterator newIteratorWithBase(final ColumnFamilyHandle columnFamilyHandle, + public RocksIterator newIteratorWithBase( + final ColumnFamilyHandle columnFamilyHandle, final RocksIterator baseIterator) { RocksIterator iterator = new RocksIterator( baseIterator.parent_, - iteratorWithBase(columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_)); + iteratorWithBase(nativeHandle_, + columnFamilyHandle.nativeHandle_, + baseIterator.nativeHandle_)); //when the iterator is deleted it will also delete the baseIterator baseIterator.disOwnNativeHandle(); return iterator; @@ -116,34 +123,48 @@ public RocksIterator newIteratorWithBase(final ColumnFamilyHandle columnFamilyHa /** * Provides Read-Your-Own-Writes like functionality by * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} - * as a delta and baseIterator as a base. Operates on the default column family. + * as a delta and baseIterator as a base. Operates on the default column + * family. * - * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} - * @return An iterator which shows a view comprised of both the database point-in-time - * from baseIterator and modifications made in this write batch. + * @param baseIterator The base iterator, + * e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @return An iterator which shows a view comprised of both the database + * point-in-timefrom baseIterator and modifications made in this write batch. 
*/ public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) { - return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator); + return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), + baseIterator); } - @Override final native void disposeInternal(long handle); - @Override final native int count0(); - @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen); - @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen, - long cfHandle); - @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen); - @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen, - long cfHandle); - @Override final native void remove(byte[] key, int keyLen); - @Override final native void remove(byte[] key, int keyLen, long cfHandle); - @Override final native void putLogData(byte[] blob, int blobLen); - @Override final native void clear0(); + @Override protected final native void disposeInternal(final long handle); + @Override final native int count0(final long handle); + @Override final native void put(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen); + @Override final native void put(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen, + final long cfHandle); + @Override final native void merge(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen); + @Override final native void merge(final long handle, final byte[] key, + final int keyLen, final byte[] value, final int valueLen, + final long cfHandle); + @Override final native void remove(final long handle, final byte[] key, + final int keyLen); + @Override final native void remove(final long handle, final byte[] key, + final int keyLen, final long cfHandle); + @Override final native void putLogData(final long handle, final byte[] blob, + 
final int blobLen); + @Override final native void clear0(final long handle); + @Override final native void setSavePoint0(final long handle); + @Override final native void rollbackToSavePoint0(final long handle); - private native void newWriteBatchWithIndex(); - private native void newWriteBatchWithIndex(boolean overwriteKey); - private native void newWriteBatchWithIndex(long fallbackIndexComparatorHandle, int reservedBytes, - boolean overwriteKey); - private native long iterator0(); - private native long iterator1(long cfHandle); - private native long iteratorWithBase(long baseIteratorHandle, long cfHandle); + private native static long newWriteBatchWithIndex(); + private native static long newWriteBatchWithIndex(final boolean overwriteKey); + private native static long newWriteBatchWithIndex( + final long fallbackIndexComparatorHandle, final int reservedBytes, + final boolean overwriteKey); + private native long iterator0(final long handle); + private native long iterator1(final long handle, final long cfHandle); + private native long iteratorWithBase(final long handle, + final long baseIteratorHandle, final long cfHandle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java b/external/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java index c27dc9b3cd..4e7abd873a 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,13 +16,8 @@ public class WriteOptions extends RocksObject { * Construct WriteOptions instance. 
*/ public WriteOptions() { - super(); - newWriteOptions(); - } + super(newWriteOptions()); - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); } /** @@ -97,10 +92,10 @@ public boolean disableWAL() { return disableWAL(nativeHandle_); } - private native void newWriteOptions(); + private native static long newWriteOptions(); private native void setSync(long handle, boolean flag); private native boolean sync(long handle); private native void setDisableWAL(long handle, boolean flag); private native boolean disableWAL(long handle); - private native void disposeInternal(long handle); + @Override protected final native void disposeInternal(final long handle); } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/util/BytewiseComparator.java b/external/rocksdb/java/src/main/java/org/rocksdb/util/BytewiseComparator.java new file mode 100644 index 0000000000..17337bfc85 --- /dev/null +++ b/external/rocksdb/java/src/main/java/org/rocksdb/util/BytewiseComparator.java @@ -0,0 +1,91 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb.util; + +import org.rocksdb.*; + +import java.nio.ByteBuffer; + +/** + * This is a Java Native implementation of the C++ + * equivalent BytewiseComparatorImpl using {@link Slice} + * + * The performance of Comparators implemented in Java is always + * less than their C++ counterparts due to the bridging overhead, + * as such you likely don't want to use this apart from benchmarking + * and you most likely instead wanted + * {@link org.rocksdb.BuiltinComparator#BYTEWISE_COMPARATOR} + */ +public class BytewiseComparator extends Comparator { + + public BytewiseComparator(final ComparatorOptions copt) { + super(copt); + } + + @Override + public String name() { + return "rocksdb.java.BytewiseComparator"; + } + + @Override + public int compare(final Slice a, final Slice b) { + return compare(a.data(), b.data()); + } + + @Override + public String findShortestSeparator(final String start, + final Slice limit) { + final byte[] startBytes = start.getBytes(); + final byte[] limitBytes = limit.data(); + + // Find length of common prefix + final int min_length = Math.min(startBytes.length, limit.size()); + int diff_index = 0; + while ((diff_index < min_length) && + (startBytes[diff_index] == limitBytes[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + final byte diff_byte = startBytes[diff_index]; + if(diff_byte < 0xff && diff_byte + 1 < limitBytes[diff_index]) { + final byte shortest[] = new byte[diff_index + 1]; + System.arraycopy(startBytes, 0, shortest, 0, diff_index + 1); + shortest[diff_index]++; + assert(compare(shortest, limitBytes) < 0); + return new String(shortest); + } + } + + return null; + } + + private static int compare(final byte[] a, final byte[] b) { + return ByteBuffer.wrap(a).compareTo(ByteBuffer.wrap(b)); + } + + @Override + public String findShortSuccessor(final String key) { + final byte[] keyBytes = key.getBytes(); + + // Find first 
character that can be incremented + final int n = keyBytes.length; + for (int i = 0; i < n; i++) { + final byte byt = keyBytes[i]; + if (byt != 0xff) { + final byte shortSuccessor[] = new byte[i + 1]; + System.arraycopy(keyBytes, 0, shortSuccessor, 0, i + 1); + shortSuccessor[i]++; + return new String(shortSuccessor); + } + } + // *key is a run of 0xffs. Leave it alone. + + return null; + } +} diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/util/DirectBytewiseComparator.java b/external/rocksdb/java/src/main/java/org/rocksdb/util/DirectBytewiseComparator.java new file mode 100644 index 0000000000..170f0f42e4 --- /dev/null +++ b/external/rocksdb/java/src/main/java/org/rocksdb/util/DirectBytewiseComparator.java @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
package org.rocksdb.util;

import org.rocksdb.ComparatorOptions;
import org.rocksdb.DirectComparator;
import org.rocksdb.DirectSlice;

import java.nio.ByteBuffer;

/**
 * This is a Java Native implementation of the C++
 * equivalent BytewiseComparatorImpl using {@link DirectSlice}
 *
 * The performance of Comparators implemented in Java is always
 * less than their C++ counterparts due to the bridging overhead,
 * as such you likely don't want to use this apart from benchmarking
 * and you most likely instead wanted
 * {@link org.rocksdb.BuiltinComparator#BYTEWISE_COMPARATOR}
 */
public class DirectBytewiseComparator extends DirectComparator {

  public DirectBytewiseComparator(final ComparatorOptions copt) {
    super(copt);
  }

  @Override
  public String name() {
    return "rocksdb.java.DirectBytewiseComparator";
  }

  /**
   * Compares two slices lexicographically with unsigned byte semantics
   * so the ordering matches the C++ memcmp-based BytewiseComparatorImpl.
   *
   * BUGFIX: the previous "a.data().compareTo(b.data())" compares
   * elements as signed bytes, which disagrees with memcmp ordering for
   * any byte &gt;= 0x80.
   *
   * @param a the first slice
   * @param b the second slice
   * @return negative, zero, or positive as {@code a} sorts before,
   *     equal to, or after {@code b}
   */
  @Override
  public int compare(final DirectSlice a, final DirectSlice b) {
    final ByteBuffer bufA = a.data();
    final ByteBuffer bufB = b.data();
    // NOTE(review): assumes data() returns a buffer whose content
    // starts at absolute index 0 — confirm against DirectSlice.
    final int minLength = Math.min(bufA.remaining(), bufB.remaining());
    for (int i = 0; i < minLength; i++) {
      final int diff = (bufA.get(i) & 0xff) - (bufB.get(i) & 0xff);
      if (diff != 0) {
        return diff;
      }
    }
    // Equal common prefix: the shorter slice sorts first.
    return bufA.remaining() - bufB.remaining();
  }

  /**
   * Finds a short key k with start &lt;= k &lt; limit, or returns
   * {@code null} when no shortening is possible (e.g. one key is a
   * prefix of the other).
   *
   * @param start lower bound (inclusive)
   * @param limit upper bound (exclusive)
   * @return a shortened separator key, or {@code null} to keep start
   */
  @Override
  public String findShortestSeparator(final String start,
      final DirectSlice limit) {
    final byte[] startBytes = start.getBytes();

    // Find length of common prefix
    final int minLength = Math.min(startBytes.length, limit.size());
    int diffIndex = 0;
    while ((diffIndex < minLength)
        && (startBytes[diffIndex] == limit.get(diffIndex))) {
      diffIndex++;
    }

    if (diffIndex >= minLength) {
      // Do not shorten if one string is a prefix of the other
      return null;
    }

    // BUGFIX: Java bytes are signed. "diff_byte < 0xff" widened a
    // signed byte against the int 255 (always true) and
    // "diff_byte + 1 < limit.get(diff_index)" used signed ordering.
    // Mask both operands into the unsigned 0..255 range first.
    final int diffByte = startBytes[diffIndex] & 0xff;
    final int limitByte = limit.get(diffIndex) & 0xff;
    if (diffByte < 0xff && diffByte + 1 < limitByte) {
      final byte[] shortest = new byte[diffIndex + 1];
      System.arraycopy(startBytes, 0, shortest, 0, diffIndex + 1);
      shortest[diffIndex]++;  // cannot wrap: diffByte < 0xff above
      assert (compareUnsigned(shortest, limit) < 0);
      // NOTE(review): new String(byte[]) uses the platform default
      // charset; round-tripping arbitrary bytes through String may be
      // lossy — confirm against the JNI bridge's expectations.
      return new String(shortest);
    }

    return null;
  }

  /**
   * Unsigned memcmp-style comparison of a byte array against a slice,
   * used to assert separator correctness under the same ordering as
   * {@link #compare(DirectSlice, DirectSlice)}.
   */
  private static int compareUnsigned(final byte[] key,
      final DirectSlice limit) {
    final int minLength = Math.min(key.length, limit.size());
    for (int i = 0; i < minLength; i++) {
      final int diff = (key[i] & 0xff) - (limit.get(i) & 0xff);
      if (diff != 0) {
        return diff;
      }
    }
    return key.length - limit.size();
  }

  /**
   * Finds a short key k with k &gt;= key by truncating at the first
   * byte that can be incremented, or returns {@code null} when the key
   * is a run of 0xff bytes and must be left alone.
   *
   * @param key the key to shorten
   * @return a short successor key, or {@code null} to keep key
   */
  @Override
  public String findShortSuccessor(final String key) {
    final byte[] keyBytes = key.getBytes();

    // Find first character that can be incremented.
    // BUGFIX: "byt != 0xff" was always true because a signed byte can
    // never equal the int 255 (byte 0xff widens to -1); mask to
    // unsigned so a run of 0xff bytes is correctly left untouched.
    final int n = keyBytes.length;
    for (int i = 0; i < n; i++) {
      if ((keyBytes[i] & 0xff) != 0xff) {
        final byte[] shortSuccessor = new byte[i + 1];
        System.arraycopy(keyBytes, 0, shortSuccessor, 0, i + 1);
        shortSuccessor[i]++;
        return new String(shortSuccessor);
      }
    }
    // *key is a run of 0xffs. Leave it alone.
    return null;
  }
}
".jnilib" : ".so"; } } diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java b/external/rocksdb/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java new file mode 100644 index 0000000000..beedc185d4 --- /dev/null +++ b/external/rocksdb/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.util; + +import org.rocksdb.BuiltinComparator; +import org.rocksdb.ComparatorOptions; +import org.rocksdb.Slice; + +/** + * This is a Java Native implementation of the C++ + * equivalent ReverseBytewiseComparatorImpl using {@link Slice} + * + * The performance of Comparators implemented in Java is always + * less than their C++ counterparts due to the bridging overhead, + * as such you likely don't want to use this apart from benchmarking + * and you most likely instead wanted + * {@link BuiltinComparator#REVERSE_BYTEWISE_COMPARATOR} + */ +public class ReverseBytewiseComparator extends BytewiseComparator { + + public ReverseBytewiseComparator(final ComparatorOptions copt) { + super(copt); + } + + @Override + public String name() { + return "rocksdb.java.ReverseBytewiseComparator"; + } + + @Override + public int compare(final Slice a, final Slice b) { + return -super.compare(a, b); + } +} diff --git a/external/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java b/external/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java index 8d50cd10e6..e66fc371cd 100644 --- a/external/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java +++ b/external/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java index a776351c01..db4b4d7d01 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,6 +8,7 @@ import java.io.IOException; import java.nio.file.*; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Random; @@ -39,57 +40,43 @@ public abstract class AbstractComparatorTest { * * @throws java.io.IOException if IO error happens. 
*/ - public void testRoundtrip(final Path db_path) throws IOException, RocksDBException { - - Options opt = null; - RocksDB db = null; - - try { - opt = new Options(); - opt.setCreateIfMissing(true); - opt.setComparator(getAscendingIntKeyComparator()); + public void testRoundtrip(final Path db_path) throws IOException, + RocksDBException { + try (final AbstractComparator comparator = getAscendingIntKeyComparator(); + final Options opt = new Options() + .setCreateIfMissing(true) + .setComparator(comparator)) { // store 10,000 random integer keys final int ITERATIONS = 10000; - - db = RocksDB.open(opt, db_path.toString()); - final Random random = new Random(); - for (int i = 0; i < ITERATIONS; i++) { - final byte key[] = intToByte(random.nextInt()); - if (i > 0 && db.get(key) != null) { // does key already exist (avoid duplicates) - i--; // generate a different key - } else { - db.put(key, "value".getBytes()); + try (final RocksDB db = RocksDB.open(opt, db_path.toString())) { + final Random random = new Random(); + for (int i = 0; i < ITERATIONS; i++) { + final byte key[] = intToByte(random.nextInt()); + // does key already exist (avoid duplicates) + if (i > 0 && db.get(key) != null) { + i--; // generate a different key + } else { + db.put(key, "value".getBytes()); + } } } - db.close(); // re-open db and read from start to end // integer keys should be in ascending // order as defined by SimpleIntComparator - db = RocksDB.open(opt, db_path.toString()); - final RocksIterator it = db.newIterator(); - it.seekToFirst(); - int lastKey = Integer.MIN_VALUE; - int count = 0; - for (it.seekToFirst(); it.isValid(); it.next()) { - final int thisKey = byteToInt(it.key()); - assertThat(thisKey).isGreaterThan(lastKey); - lastKey = thisKey; - count++; - } - it.dispose(); - db.close(); - - assertThat(count).isEqualTo(ITERATIONS); - - } finally { - if (db != null) { - db.close(); - } - - if (opt != null) { - opt.dispose(); + try (final RocksDB db = RocksDB.open(opt, 
db_path.toString()); + final RocksIterator it = db.newIterator()) { + it.seekToFirst(); + int lastKey = Integer.MIN_VALUE; + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final int thisKey = byteToInt(it.key()); + assertThat(thisKey).isGreaterThan(lastKey); + lastKey = thisKey; + count++; + } + assertThat(count).isEqualTo(ITERATIONS); } } } @@ -109,80 +96,75 @@ public void testRoundtrip(final Path db_path) throws IOException, RocksDBExcepti public void testRoundtripCf(final Path db_path) throws IOException, RocksDBException { - DBOptions opt = null; - RocksDB db = null; - List cfDescriptors = - new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(), - new ColumnFamilyOptions().setComparator( - getAscendingIntKeyComparator()))); - List cfHandles = new ArrayList<>(); - try { - opt = new DBOptions(). - setCreateIfMissing(true). - setCreateMissingColumnFamilies(true); - - // store 10,000 random integer keys - final int ITERATIONS = 10000; + try(final AbstractComparator comparator = getAscendingIntKeyComparator()) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), + new ColumnFamilyOptions().setComparator(comparator)) + ); - db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles); - assertThat(cfDescriptors.size()).isEqualTo(2); - assertThat(cfHandles.size()).isEqualTo(2); + final List cfHandles = new ArrayList<>(); - final Random random = new Random(); - for (int i = 0; i < ITERATIONS; i++) { - final byte key[] = intToByte(random.nextInt()); - if (i > 0 && db.get(cfHandles.get(1), key) != null) { - // does key already exist (avoid duplicates) - i--; // generate a different key - } else { - db.put(cfHandles.get(1), key, "value".getBytes()); + try (final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). 
+ setCreateMissingColumnFamilies(true)) { + + // store 10,000 random integer keys + final int ITERATIONS = 10000; + + try (final RocksDB db = RocksDB.open(opt, db_path.toString(), + cfDescriptors, cfHandles)) { + try { + assertThat(cfDescriptors.size()).isEqualTo(2); + assertThat(cfHandles.size()).isEqualTo(2); + + final Random random = new Random(); + for (int i = 0; i < ITERATIONS; i++) { + final byte key[] = intToByte(random.nextInt()); + if (i > 0 && db.get(cfHandles.get(1), key) != null) { + // does key already exist (avoid duplicates) + i--; // generate a different key + } else { + db.put(cfHandles.get(1), key, "value".getBytes()); + } + } + } finally { + for (final ColumnFamilyHandle handle : cfHandles) { + handle.close(); + } + } + cfHandles.clear(); } - } - for (ColumnFamilyHandle handle : cfHandles) { - handle.dispose(); - } - cfHandles.clear(); - db.close(); - - // re-open db and read from start to end - // integer keys should be in ascending - // order as defined by SimpleIntComparator - db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles); - assertThat(cfDescriptors.size()).isEqualTo(2); - assertThat(cfHandles.size()).isEqualTo(2); - final RocksIterator it = db.newIterator(cfHandles.get(1)); - it.seekToFirst(); - int lastKey = Integer.MIN_VALUE; - int count = 0; - for (it.seekToFirst(); it.isValid(); it.next()) { - final int thisKey = byteToInt(it.key()); - assertThat(thisKey).isGreaterThan(lastKey); - lastKey = thisKey; - count++; - } - it.dispose(); - for (ColumnFamilyHandle handle : cfHandles) { - handle.dispose(); - } - cfHandles.clear(); - db.close(); - assertThat(count).isEqualTo(ITERATIONS); - - } finally { - for (ColumnFamilyHandle handle : cfHandles) { - handle.dispose(); - } - - if (db != null) { - db.close(); - } - - if (opt != null) { - opt.dispose(); + // re-open db and read from start to end + // integer keys should be in ascending + // order as defined by SimpleIntComparator + try (final RocksDB db = RocksDB.open(opt, 
db_path.toString(), + cfDescriptors, cfHandles); + final RocksIterator it = db.newIterator(cfHandles.get(1))) { + try { + assertThat(cfDescriptors.size()).isEqualTo(2); + assertThat(cfHandles.size()).isEqualTo(2); + + it.seekToFirst(); + int lastKey = Integer.MIN_VALUE; + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final int thisKey = byteToInt(it.key()); + assertThat(thisKey).isGreaterThan(lastKey); + lastKey = thisKey; + count++; + } + + assertThat(count).isEqualTo(ITERATIONS); + + } finally { + for (final ColumnFamilyHandle handle : cfHandles) { + handle.close(); + } + } + cfHandles.clear(); + } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java index 48dff19e12..b50ddf4990 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -28,148 +28,96 @@ public class BackupEngineTest { @Test public void backupDb() throws RocksDBException { - Options opt = null; - RocksDB db = null; - try { - opt = new Options().setCreateIfMissing(true); - // Open empty database. - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + // Open empty database. 
+ try(final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + // Fill database with some test values prepareDatabase(db); + // Create two backups - BackupableDBOptions bopt = null; - try { - bopt = new BackupableDBOptions( + try(final BackupableDBOptions bopt = new BackupableDBOptions( backupFolder.getRoot().getAbsolutePath()); - try(final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { - be.createNewBackup(db, false); - be.createNewBackup(db, true); - verifyNumberOfValidBackups(be, 2); - } - } finally { - if(bopt != null) { - bopt.dispose(); - } - } - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { + be.createNewBackup(db, false); + be.createNewBackup(db, true); + verifyNumberOfValidBackups(be, 2); } } } @Test public void deleteBackup() throws RocksDBException { - Options opt = null; - RocksDB db = null; - try { - opt = new Options().setCreateIfMissing(true); - // Open empty database. - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + // Open empty database. + try(final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { // Fill database with some test values prepareDatabase(db); // Create two backups - BackupableDBOptions bopt = null; - try { - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - try(final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { - be.createNewBackup(db, false); - be.createNewBackup(db, true); - final List backupInfo = - verifyNumberOfValidBackups(be, 2); - // Delete the first backup - be.deleteBackup(backupInfo.get(0).backupId()); - final List newBackupInfo = - verifyNumberOfValidBackups(be, 1); - - // The second backup must remain. - assertThat(newBackupInfo.get(0).backupId()). 
- isEqualTo(backupInfo.get(1).backupId()); - } - } finally { - if(bopt != null) { - bopt.dispose(); - } - } - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + try(final BackupableDBOptions bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { + be.createNewBackup(db, false); + be.createNewBackup(db, true); + final List backupInfo = + verifyNumberOfValidBackups(be, 2); + // Delete the first backup + be.deleteBackup(backupInfo.get(0).backupId()); + final List newBackupInfo = + verifyNumberOfValidBackups(be, 1); + + // The second backup must remain. + assertThat(newBackupInfo.get(0).backupId()). + isEqualTo(backupInfo.get(1).backupId()); } } } @Test public void purgeOldBackups() throws RocksDBException { - Options opt = null; - RocksDB db = null; - try { - opt = new Options().setCreateIfMissing(true); - // Open empty database. - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + // Open empty database. + try(final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { // Fill database with some test values prepareDatabase(db); // Create four backups - BackupableDBOptions bopt = null; - try { - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - try(final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { - be.createNewBackup(db, false); - be.createNewBackup(db, true); - be.createNewBackup(db, true); - be.createNewBackup(db, true); - final List backupInfo = - verifyNumberOfValidBackups(be, 4); - // Delete everything except the latest backup - be.purgeOldBackups(1); - final List newBackupInfo = - verifyNumberOfValidBackups(be, 1); - // The latest backup must remain. - assertThat(newBackupInfo.get(0).backupId()). 
- isEqualTo(backupInfo.get(3).backupId()); - } - } finally { - if(bopt != null) { - bopt.dispose(); - } - } - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + try(final BackupableDBOptions bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { + be.createNewBackup(db, false); + be.createNewBackup(db, true); + be.createNewBackup(db, true); + be.createNewBackup(db, true); + final List backupInfo = + verifyNumberOfValidBackups(be, 4); + // Delete everything except the latest backup + be.purgeOldBackups(1); + final List newBackupInfo = + verifyNumberOfValidBackups(be, 1); + // The latest backup must remain. + assertThat(newBackupInfo.get(0).backupId()). + isEqualTo(backupInfo.get(3).backupId()); } } } @Test - public void restoreLatestBackup() - throws RocksDBException { - Options opt = null; - RocksDB db = null; - try { - opt = new Options().setCreateIfMissing(true); + public void restoreLatestBackup() throws RocksDBException { + try(final Options opt = new Options().setCreateIfMissing(true)) { // Open empty database. 
- db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(db); - BackupableDBOptions bopt = null; + RocksDB db = null; try { - bopt = new BackupableDBOptions( + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(db); + + try (final BackupableDBOptions bopt = new BackupableDBOptions( backupFolder.getRoot().getAbsolutePath()); - try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { + final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { be.createNewBackup(db, true); verifyNumberOfValidBackups(be, 1); db.put("key1".getBytes(), "valueV2".getBytes()); @@ -182,51 +130,44 @@ public void restoreLatestBackup() assertThat(new String(db.get("key2".getBytes()))).endsWith("V3"); db.close(); + db = null; verifyNumberOfValidBackups(be, 2); // restore db from latest backup - be.restoreDbFromLatestBackup(dbFolder.getRoot().getAbsolutePath(), - dbFolder.getRoot().getAbsolutePath(), - new RestoreOptions(false)); + try(final RestoreOptions ropts = new RestoreOptions(false)) { + be.restoreDbFromLatestBackup(dbFolder.getRoot().getAbsolutePath(), + dbFolder.getRoot().getAbsolutePath(), ropts); + } + // Open database again. - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath()); + // Values must have suffix V2 because of restoring latest backup. assertThat(new String(db.get("key1".getBytes()))).endsWith("V2"); assertThat(new String(db.get("key2".getBytes()))).endsWith("V2"); } } finally { - if(bopt != null) { - bopt.dispose(); + if(db != null) { + db.close(); } } - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void restoreFromBackup() throws RocksDBException { - Options opt = null; - RocksDB db = null; - try { - opt = new Options().setCreateIfMissing(true); - // Open empty database. 
- db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(db); - BackupableDBOptions bopt = null; + try(final Options opt = new Options().setCreateIfMissing(true)) { + RocksDB db = null; try { - bopt = new BackupableDBOptions( + // Open empty database. + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(db); + try (final BackupableDBOptions bopt = new BackupableDBOptions( backupFolder.getRoot().getAbsolutePath()); - try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { + final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { be.createNewBackup(db, true); verifyNumberOfValidBackups(be, 1); db.put("key1".getBytes(), "valueV2".getBytes()); @@ -240,9 +181,10 @@ public void restoreFromBackup() //close the database db.close(); + db = null; //restore the backup - List backupInfo = verifyNumberOfValidBackups(be, 2); + final List backupInfo = verifyNumberOfValidBackups(be, 2); // restore db from first backup be.restoreDbFromBackup(backupInfo.get(0).backupId(), dbFolder.getRoot().getAbsolutePath(), @@ -256,17 +198,10 @@ public void restoreFromBackup() assertThat(new String(db.get("key2".getBytes()))).endsWith("V1"); } } finally { - if(bopt != null) { - bopt.dispose(); + if(db != null) { + db.close(); } } - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java index 6fe3bd2f0b..c3836ac9b1 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java @@ -1,22 +1,23 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Random; + import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; -import java.util.Random; - -import static org.assertj.core.api.Assertions.assertThat; - public class BackupableDBOptionsTest { - private final static String ARBITRARY_PATH = "/tmp"; + private final static String ARBITRARY_PATH = + System.getProperty("java.io.tmpdir"); @ClassRule public static final RocksMemoryResource rocksMemoryResource = @@ -30,87 +31,61 @@ public class BackupableDBOptionsTest { @Test public void backupDir() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH)) { assertThat(backupableDBOptions.backupDir()). isEqualTo(ARBITRARY_PATH); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void shareTableFiles() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); - boolean value = rand.nextBoolean(); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH)) { + final boolean value = rand.nextBoolean(); backupableDBOptions.setShareTableFiles(value); assertThat(backupableDBOptions.shareTableFiles()). 
isEqualTo(value); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void sync() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); - boolean value = rand.nextBoolean(); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH)) { + final boolean value = rand.nextBoolean(); backupableDBOptions.setSync(value); assertThat(backupableDBOptions.sync()).isEqualTo(value); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void destroyOldData() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); - boolean value = rand.nextBoolean(); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH);) { + final boolean value = rand.nextBoolean(); backupableDBOptions.setDestroyOldData(value); assertThat(backupableDBOptions.destroyOldData()). isEqualTo(value); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void backupLogFiles() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); - boolean value = rand.nextBoolean(); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH)) { + final boolean value = rand.nextBoolean(); backupableDBOptions.setBackupLogFiles(value); assertThat(backupableDBOptions.backupLogFiles()). 
isEqualTo(value); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void backupRateLimit() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); - long value = Math.abs(rand.nextLong()); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH)) { + final long value = Math.abs(rand.nextLong()); backupableDBOptions.setBackupRateLimit(value); assertThat(backupableDBOptions.backupRateLimit()). isEqualTo(value); @@ -118,19 +93,14 @@ public void backupRateLimit() { backupableDBOptions.setBackupRateLimit(-1); assertThat(backupableDBOptions.backupRateLimit()). isEqualTo(0); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void restoreRateLimit() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); - long value = Math.abs(rand.nextLong()); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH)) { + final long value = Math.abs(rand.nextLong()); backupableDBOptions.setRestoreRateLimit(value); assertThat(backupableDBOptions.restoreRateLimit()). isEqualTo(value); @@ -138,145 +108,153 @@ public void restoreRateLimit() { backupableDBOptions.setRestoreRateLimit(-1); assertThat(backupableDBOptions.restoreRateLimit()). isEqualTo(0); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void shareFilesWithChecksum() { - BackupableDBOptions backupableDBOptions = null; - try { - backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + try (final BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH)) { boolean value = rand.nextBoolean(); backupableDBOptions.setShareFilesWithChecksum(value); assertThat(backupableDBOptions.shareFilesWithChecksum()). 
isEqualTo(value); - } finally { - if (backupableDBOptions != null) { - backupableDBOptions.dispose(); - } } } @Test public void failBackupDirIsNull() { exception.expect(IllegalArgumentException.class); - new BackupableDBOptions(null); + try (final BackupableDBOptions opts = new BackupableDBOptions(null)) { + //no-op + } } @Test - public void failBackupDirIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.backupDir(); + public void failBackupDirIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.backupDir(); + } } @Test - public void failSetShareTableFilesIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.setShareTableFiles(true); + public void failSetShareTableFilesIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.setShareTableFiles(true); + } } @Test - public void failShareTableFilesIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.shareTableFiles(); + public void failShareTableFilesIfDisposed() { + try (BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.shareTableFiles(); + } } @Test - public void failSetSyncIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.setSync(true); + public void failSetSyncIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.setSync(true); + } } @Test - public void failSyncIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.sync(); + public void failSyncIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.sync(); + } } @Test - public 
void failSetDestroyOldDataIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.setDestroyOldData(true); + public void failSetDestroyOldDataIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.setDestroyOldData(true); + } } @Test - public void failDestroyOldDataIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.destroyOldData(); + public void failDestroyOldDataIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.destroyOldData(); + } } @Test - public void failSetBackupLogFilesIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.setBackupLogFiles(true); + public void failSetBackupLogFilesIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.setBackupLogFiles(true); + } } @Test - public void failBackupLogFilesIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.backupLogFiles(); + public void failBackupLogFilesIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.backupLogFiles(); + } } @Test - public void failSetBackupRateLimitIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.setBackupRateLimit(1); + public void failSetBackupRateLimitIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.setBackupRateLimit(1); + } } @Test - public void failBackupRateLimitIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.backupRateLimit(); + public void failBackupRateLimitIfDisposed() { + try (final 
BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.backupRateLimit(); + } } @Test - public void failSetRestoreRateLimitIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.setRestoreRateLimit(1); + public void failSetRestoreRateLimitIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.setRestoreRateLimit(1); + } } @Test - public void failRestoreRateLimitIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.restoreRateLimit(); + public void failRestoreRateLimitIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.restoreRateLimit(); + } } @Test - public void failSetShareFilesWithChecksumIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.setShareFilesWithChecksum(true); + public void failSetShareFilesWithChecksumIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.setShareFilesWithChecksum(true); + } } @Test - public void failShareFilesWithChecksumIfDisposed(){ - BackupableDBOptions options = setupUninitializedBackupableDBOptions( - exception); - options.shareFilesWithChecksum(); + public void failShareFilesWithChecksumIfDisposed() { + try (final BackupableDBOptions options = + setupUninitializedBackupableDBOptions(exception)) { + options.shareFilesWithChecksum(); + } } private BackupableDBOptions setupUninitializedBackupableDBOptions( ExpectedException exception) { - BackupableDBOptions backupableDBOptions = + final BackupableDBOptions backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); - backupableDBOptions.dispose(); + backupableDBOptions.close(); exception.expect(AssertionError.class); return backupableDBOptions; } diff --git 
a/external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java deleted file mode 100644 index 3f358bdb7b..0000000000 --- a/external/rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java +++ /dev/null @@ -1,425 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -import java.util.List; - -import static org.assertj.core.api.Assertions.assertThat; - -public class BackupableDBTest { - - @ClassRule - public static final RocksMemoryResource rocksMemoryResource = - new RocksMemoryResource(); - - @Rule - public TemporaryFolder dbFolder = new TemporaryFolder(); - - @Rule - public TemporaryFolder backupFolder = new TemporaryFolder(); - - @Test - public void backupDb() throws RocksDBException { - Options opt = null; - BackupableDBOptions bopt = null; - BackupableDB bdb = null; - try { - opt = new Options().setCreateIfMissing(true); - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - assertThat(bopt.backupDir()).isEqualTo( - backupFolder.getRoot().getAbsolutePath()); - // Open empty database. 
- bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(bdb); - // Create two backups - bdb.createNewBackup(false); - bdb.createNewBackup(true); - verifyNumberOfValidBackups(bdb, 2); - } finally { - if (bdb != null) { - bdb.close(); - } - if (bopt != null) { - bopt.dispose(); - } - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void deleteBackup() throws RocksDBException { - Options opt = null; - BackupableDBOptions bopt = null; - BackupableDB bdb = null; - try { - opt = new Options().setCreateIfMissing(true); - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - assertThat(bopt.backupDir()).isEqualTo( - backupFolder.getRoot().getAbsolutePath()); - // Open empty database. - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(bdb); - // Create two backups - bdb.createNewBackup(false); - bdb.createNewBackup(true); - List backupInfo = - verifyNumberOfValidBackups(bdb, 2); - // Delete the first backup - bdb.deleteBackup(backupInfo.get(0).backupId()); - List newBackupInfo = - verifyNumberOfValidBackups(bdb, 1); - // The second backup must remain. - assertThat(newBackupInfo.get(0).backupId()). - isEqualTo(backupInfo.get(1).backupId()); - } finally { - if (bdb != null) { - bdb.close(); - } - if (bopt != null) { - bopt.dispose(); - } - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void deleteBackupWithRestoreBackupableDB() - throws RocksDBException { - Options opt = null; - BackupableDBOptions bopt = null; - BackupableDB bdb = null; - RestoreBackupableDB rdb = null; - try { - opt = new Options().setCreateIfMissing(true); - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - assertThat(bopt.backupDir()).isEqualTo( - backupFolder.getRoot().getAbsolutePath()); - // Open empty database. 
- bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(bdb); - // Create two backups - bdb.createNewBackup(false); - bdb.createNewBackup(true); - List backupInfo = - verifyNumberOfValidBackups(bdb, 2); - // init RestoreBackupableDB - rdb = new RestoreBackupableDB(bopt); - // Delete the first backup - rdb.deleteBackup(backupInfo.get(0).backupId()); - // Fetch backup info using RestoreBackupableDB - List newBackupInfo = verifyNumberOfValidBackups(rdb, 1); - // The second backup must remain. - assertThat(newBackupInfo.get(0).backupId()). - isEqualTo(backupInfo.get(1).backupId()); - } finally { - if (bdb != null) { - bdb.close(); - } - if (rdb != null) { - rdb.dispose(); - } - if (bopt != null) { - bopt.dispose(); - } - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void purgeOldBackups() throws RocksDBException { - Options opt = null; - BackupableDBOptions bopt = null; - BackupableDB bdb = null; - try { - opt = new Options().setCreateIfMissing(true); - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - assertThat(bopt.backupDir()).isEqualTo( - backupFolder.getRoot().getAbsolutePath()); - // Open empty database. - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(bdb); - // Create two backups - bdb.createNewBackup(false); - bdb.createNewBackup(true); - bdb.createNewBackup(true); - bdb.createNewBackup(true); - List backupInfo = - verifyNumberOfValidBackups(bdb, 4); - // Delete everything except the latest backup - bdb.purgeOldBackups(1); - List newBackupInfo = - verifyNumberOfValidBackups(bdb, 1); - // The latest backup must remain. - assertThat(newBackupInfo.get(0).backupId()). 
- isEqualTo(backupInfo.get(3).backupId()); - } finally { - if (bdb != null) { - bdb.close(); - } - if (bopt != null) { - bopt.dispose(); - } - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void purgeOldBackupsWithRestoreBackupableDb() - throws RocksDBException { - Options opt = null; - BackupableDBOptions bopt = null; - BackupableDB bdb = null; - RestoreBackupableDB rdb = null; - try { - opt = new Options().setCreateIfMissing(true); - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - assertThat(bopt.backupDir()).isEqualTo( - backupFolder.getRoot().getAbsolutePath()); - // Open empty database. - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(bdb); - // Create two backups - bdb.createNewBackup(false); - bdb.createNewBackup(true); - bdb.createNewBackup(true); - bdb.createNewBackup(true); - List infos = verifyNumberOfValidBackups(bdb, 4); - assertThat(infos.get(1).size()). - isEqualTo(infos.get(2).size()); - assertThat(infos.get(1).numberFiles()). - isEqualTo(infos.get(2).numberFiles()); - long maxTimeBeforePurge = Long.MIN_VALUE; - for (BackupInfo backupInfo : infos) { - if (maxTimeBeforePurge < backupInfo.timestamp()) { - maxTimeBeforePurge = backupInfo.timestamp(); - } - } - // init RestoreBackupableDB - rdb = new RestoreBackupableDB(bopt); - // the same number of backups must - // exist using RestoreBackupableDB. - verifyNumberOfValidBackups(rdb, 4); - rdb.purgeOldBackups(1); - infos = verifyNumberOfValidBackups(rdb, 1); - assertThat(infos.get(0).timestamp()). 
- isEqualTo(maxTimeBeforePurge); - } finally { - if (bdb != null) { - bdb.close(); - } - if (rdb != null) { - rdb.dispose(); - } - if (bopt != null) { - bopt.dispose(); - } - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void restoreLatestBackup() - throws RocksDBException { - Options opt = null; - BackupableDBOptions bopt = null; - BackupableDB bdb = null; - RestoreBackupableDB rdb = null; - try { - opt = new Options().setCreateIfMissing(true); - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - assertThat(bopt.backupDir()).isEqualTo( - backupFolder.getRoot().getAbsolutePath()); - // Open empty database. - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(bdb); - bdb.createNewBackup(true); - verifyNumberOfValidBackups(bdb, 1); - bdb.put("key1".getBytes(), "valueV2".getBytes()); - bdb.put("key2".getBytes(), "valueV2".getBytes()); - bdb.createNewBackup(true); - verifyNumberOfValidBackups(bdb, 2); - bdb.put("key1".getBytes(), "valueV3".getBytes()); - bdb.put("key2".getBytes(), "valueV3".getBytes()); - assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3"); - assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3"); - bdb.close(); - - // init RestoreBackupableDB - rdb = new RestoreBackupableDB(bopt); - verifyNumberOfValidBackups(rdb, 2); - // restore db from latest backup - rdb.restoreDBFromLatestBackup(dbFolder.getRoot().getAbsolutePath(), - dbFolder.getRoot().getAbsolutePath(), - new RestoreOptions(false)); - // Open database again. - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Values must have suffix V2 because of restoring latest backup. 
- assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V2"); - assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V2"); - } finally { - if (bdb != null) { - bdb.close(); - } - if (rdb != null) { - rdb.dispose(); - } - if (bopt != null) { - bopt.dispose(); - } - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void restoreFromBackup() - throws RocksDBException { - Options opt = null; - BackupableDBOptions bopt = null; - BackupableDB bdb = null; - RestoreBackupableDB rdb = null; - try { - opt = new Options().setCreateIfMissing(true); - bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath()); - assertThat(bopt.backupDir()).isEqualTo( - backupFolder.getRoot().getAbsolutePath()); - // Open empty database. - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Fill database with some test values - prepareDatabase(bdb); - bdb.createNewBackup(true); - verifyNumberOfValidBackups(bdb, 1); - bdb.put("key1".getBytes(), "valueV2".getBytes()); - bdb.put("key2".getBytes(), "valueV2".getBytes()); - bdb.createNewBackup(true); - verifyNumberOfValidBackups(bdb, 2); - bdb.put("key1".getBytes(), "valueV3".getBytes()); - bdb.put("key2".getBytes(), "valueV3".getBytes()); - assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3"); - assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3"); - bdb.close(); - - // init RestoreBackupableDB - rdb = new RestoreBackupableDB(bopt); - List backupInfo = verifyNumberOfValidBackups(rdb, 2); - // restore db from first backup - rdb.restoreDBFromBackup(backupInfo.get(0).backupId(), - dbFolder.getRoot().getAbsolutePath(), - dbFolder.getRoot().getAbsolutePath(), - new RestoreOptions(false)); - // Open database again. - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - // Values must have suffix V2 because of restoring latest backup. 
- assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V1"); - assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V1"); - } finally { - if (bdb != null) { - bdb.close(); - } - if (rdb != null) { - rdb.dispose(); - } - if (bopt != null) { - bopt.dispose(); - } - if (opt != null) { - opt.dispose(); - } - } - } - - /** - * Verify backups. - * - * @param bdb {@link BackupableDB} instance. - * @param expectedNumberOfBackups numerical value - * @throws RocksDBException thrown if an error occurs within the native - * part of the library. - */ - private List verifyNumberOfValidBackups(BackupableDB bdb, - int expectedNumberOfBackups) throws RocksDBException { - // Verify that backups exist - assertThat(bdb.getCorruptedBackups().length). - isEqualTo(0); - bdb.garbageCollect(); - List backupInfo = bdb.getBackupInfos(); - assertThat(backupInfo.size()). - isEqualTo(expectedNumberOfBackups); - return backupInfo; - } - - /** - * Verify backups. - * - * @param rdb {@link RestoreBackupableDB} instance. - * @param expectedNumberOfBackups numerical value - * @throws RocksDBException thrown if an error occurs within the native - * part of the library. - */ - private List verifyNumberOfValidBackups( - RestoreBackupableDB rdb, int expectedNumberOfBackups) - throws RocksDBException { - // Verify that backups exist - assertThat(rdb.getCorruptedBackups().length). - isEqualTo(0); - rdb.garbageCollect(); - List backupInfo = rdb.getBackupInfos(); - assertThat(backupInfo.size()). - isEqualTo(expectedNumberOfBackups); - return backupInfo; - } - - /** - * Fill database with some test values. - * - * @param db {@link RocksDB} instance. - * @throws RocksDBException thrown if an error occurs within the native - * part of the library. 
- */ - private void prepareDatabase(RocksDB db) - throws RocksDBException { - db.put("key1".getBytes(), "valueV1".getBytes()); - db.put("key2".getBytes(), "valueV1".getBytes()); - } -} diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index aacf44054c..ccb7b7625f 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -131,34 +131,20 @@ public void blockSize() { @Test public void blockBasedTableWithFilter() { - Options options = null; - try { - options = new Options(); - options.setTableFormatConfig( - new BlockBasedTableConfig().setFilter( - new BloomFilter(10))); + try(final Options options = new Options() + .setTableFormatConfig(new BlockBasedTableConfig() + .setFilter(new BloomFilter(10)))) { assertThat(options.tableFactoryName()). isEqualTo("BlockBasedTable"); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void blockBasedTableWithoutFilter() { - Options options = null; - try { - options = new Options(); - options.setTableFormatConfig( - new BlockBasedTableConfig().setFilter(null)); + try(final Options options = new Options().setTableFormatConfig( + new BlockBasedTableConfig().setFilter(null))) { assertThat(options.tableFactoryName()). 
isEqualTo("BlockBasedTable"); - } finally { - if (options != null) { - options.dispose(); - } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java index 3081e585a8..e79569fb8b 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java @@ -22,76 +22,61 @@ public class CheckPointTest { @Test public void checkPoint() throws RocksDBException { - RocksDB db = null; - Options options = null; - Checkpoint checkpoint = null; - try { - options = new Options(). - setCreateIfMissing(true); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.put("key".getBytes(), "value".getBytes()); - checkpoint = Checkpoint.create(db); - checkpoint.createCheckpoint(checkpointFolder. - getRoot().getAbsolutePath() + "/snapshot1"); - db.put("key2".getBytes(), "value2".getBytes()); - checkpoint.createCheckpoint(checkpointFolder. - getRoot().getAbsolutePath() + "/snapshot2"); - db.close(); - db = RocksDB.open(options, - checkpointFolder.getRoot().getAbsolutePath() + - "/snapshot1"); - assertThat(new String(db.get("key".getBytes()))). - isEqualTo("value"); - assertThat(db.get("key2".getBytes())).isNull(); - db.close(); - db = RocksDB.open(options, - checkpointFolder.getRoot().getAbsolutePath() + - "/snapshot2"); - assertThat(new String(db.get("key".getBytes()))). - isEqualTo("value"); - assertThat(new String(db.get("key2".getBytes()))). - isEqualTo("value2"); - } finally { - if (db != null) { - db.close(); + try (final Options options = new Options(). + setCreateIfMissing(true)) { + + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + db.put("key".getBytes(), "value".getBytes()); + try (final Checkpoint checkpoint = Checkpoint.create(db)) { + checkpoint.createCheckpoint(checkpointFolder. 
+ getRoot().getAbsolutePath() + "/snapshot1"); + db.put("key2".getBytes(), "value2".getBytes()); + checkpoint.createCheckpoint(checkpointFolder. + getRoot().getAbsolutePath() + "/snapshot2"); + } } - if (options != null) { - options.dispose(); + + try (final RocksDB db = RocksDB.open(options, + checkpointFolder.getRoot().getAbsolutePath() + + "/snapshot1")) { + assertThat(new String(db.get("key".getBytes()))). + isEqualTo("value"); + assertThat(db.get("key2".getBytes())).isNull(); } - if (checkpoint != null) { - checkpoint.dispose(); + + try (final RocksDB db = RocksDB.open(options, + checkpointFolder.getRoot().getAbsolutePath() + + "/snapshot2")) { + assertThat(new String(db.get("key".getBytes()))). + isEqualTo("value"); + assertThat(new String(db.get("key2".getBytes()))). + isEqualTo("value2"); } } } @Test(expected = IllegalArgumentException.class) public void failIfDbIsNull() { - Checkpoint.create(null); + try (final Checkpoint checkpoint = Checkpoint.create(null)) { + + } } @Test(expected = IllegalStateException.class) public void failIfDbNotInitialized() throws RocksDBException { - RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); - db.dispose(); - Checkpoint.create(db); + try (final RocksDB db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath())) { + db.close(); + Checkpoint.create(db); + } } @Test(expected = RocksDBException.class) public void failWithIllegalPath() throws RocksDBException { - RocksDB db = null; - Checkpoint checkpoint = null; - try { - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); - checkpoint = Checkpoint.create(db); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final Checkpoint checkpoint = Checkpoint.create(db)) { checkpoint.createCheckpoint("/Z:///:\\C:\\TZ/-"); - } finally { - if (db != null) { - db.close(); - } - if (checkpoint != null) { - checkpoint.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java 
b/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java index af7216128b..18c5270e8e 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -26,616 +26,368 @@ public class ColumnFamilyOptionsTest { @Test public void getColumnFamilyOptionsFromProps() { - ColumnFamilyOptions opt = null; - try { + Properties properties = new Properties(); + properties.put("write_buffer_size", "112"); + properties.put("max_write_buffer_number", "13"); + + try (final ColumnFamilyOptions opt = ColumnFamilyOptions. + getColumnFamilyOptionsFromProps(properties)) { // setup sample properties - Properties properties = new Properties(); - properties.put("write_buffer_size", "112"); - properties.put("max_write_buffer_number", "13"); - opt = ColumnFamilyOptions. - getColumnFamilyOptionsFromProps(properties); assertThat(opt).isNotNull(); assertThat(String.valueOf(opt.writeBufferSize())). isEqualTo(properties.get("write_buffer_size")); assertThat(String.valueOf(opt.maxWriteBufferNumber())). isEqualTo(properties.get("max_write_buffer_number")); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void failColumnFamilyOptionsFromPropsWithIllegalValue() { - ColumnFamilyOptions opt = null; - try { - // setup sample properties - Properties properties = new Properties(); - properties.put("tomato", "1024"); - properties.put("burger", "2"); - opt = ColumnFamilyOptions. 
- getColumnFamilyOptionsFromProps(properties); + // setup sample properties + final Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + + try (final ColumnFamilyOptions opt = + ColumnFamilyOptions.getColumnFamilyOptionsFromProps(properties)) { assertThat(opt).isNull(); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test(expected = IllegalArgumentException.class) public void failColumnFamilyOptionsFromPropsWithNullValue() { - ColumnFamilyOptions.getColumnFamilyOptionsFromProps(null); + try (final ColumnFamilyOptions opt = + ColumnFamilyOptions.getColumnFamilyOptionsFromProps(null)) { + } } @Test(expected = IllegalArgumentException.class) public void failColumnFamilyOptionsFromPropsWithEmptyProps() { - ColumnFamilyOptions.getColumnFamilyOptionsFromProps( - new Properties()); + try (final ColumnFamilyOptions opt = + ColumnFamilyOptions.getColumnFamilyOptionsFromProps( + new Properties())) { + } } @Test public void writeBufferSize() throws RocksDBException { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - long longValue = rand.nextLong(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final long longValue = rand.nextLong(); opt.setWriteBufferSize(longValue); assertThat(opt.writeBufferSize()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxWriteBufferNumber() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setMaxWriteBufferNumber(intValue); assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void minWriteBufferNumberToMerge() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + 
try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setMinWriteBufferNumberToMerge(intValue); assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void numLevels() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setNumLevels(intValue); assertThat(opt.numLevels()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelZeroFileNumCompactionTrigger() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setLevelZeroFileNumCompactionTrigger(intValue); assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelZeroSlowdownWritesTrigger() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setLevelZeroSlowdownWritesTrigger(intValue); assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelZeroStopWritesTrigger() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setLevelZeroStopWritesTrigger(intValue); assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); 
- } } } @Test public void targetFileSizeBase() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - long longValue = rand.nextLong(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final long longValue = rand.nextLong(); opt.setTargetFileSizeBase(longValue); assertThat(opt.targetFileSizeBase()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void targetFileSizeMultiplier() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setTargetFileSizeMultiplier(intValue); assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxBytesForLevelBase() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - long longValue = rand.nextLong(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final long longValue = rand.nextLong(); opt.setMaxBytesForLevelBase(longValue); assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelCompactionDynamicLevelBytes() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { final boolean boolValue = rand.nextBoolean(); opt.setLevelCompactionDynamicLevelBytes(boolValue); assertThat(opt.levelCompactionDynamicLevelBytes()) .isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxBytesForLevelMultiplier() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); 
opt.setMaxBytesForLevelMultiplier(intValue); assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void expandedCompactionFactor() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setExpandedCompactionFactor(intValue); assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void sourceCompactionFactor() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setSourceCompactionFactor(intValue); assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxGrandparentOverlapFactor() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setMaxGrandparentOverlapFactor(intValue); assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void softRateLimit() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - double doubleValue = rand.nextDouble(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final double doubleValue = rand.nextDouble(); opt.setSoftRateLimit(doubleValue); assertThat(opt.softRateLimit()).isEqualTo(doubleValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void hardRateLimit() { - ColumnFamilyOptions opt = null; - try { - opt = new 
ColumnFamilyOptions(); - double doubleValue = rand.nextDouble(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final double doubleValue = rand.nextDouble(); opt.setHardRateLimit(doubleValue); assertThat(opt.hardRateLimit()).isEqualTo(doubleValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void rateLimitDelayMaxMilliseconds() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setRateLimitDelayMaxMilliseconds(intValue); assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void arenaBlockSize() throws RocksDBException { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - long longValue = rand.nextLong(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final long longValue = rand.nextLong(); opt.setArenaBlockSize(longValue); assertThat(opt.arenaBlockSize()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void disableAutoCompactions() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - boolean boolValue = rand.nextBoolean(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setDisableAutoCompactions(boolValue); assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void purgeRedundantKvsWhileFlush() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - boolean boolValue = rand.nextBoolean(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setPurgeRedundantKvsWhileFlush(boolValue); 
assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void verifyChecksumsInCompaction() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - boolean boolValue = rand.nextBoolean(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setVerifyChecksumsInCompaction(boolValue); assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void filterDeletes() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - boolean boolValue = rand.nextBoolean(); - opt.setFilterDeletes(boolValue); - assertThat(opt.filterDeletes()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxSequentialSkipInIterations() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - long longValue = rand.nextLong(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final long longValue = rand.nextLong(); opt.setMaxSequentialSkipInIterations(longValue); assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void inplaceUpdateSupport() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - boolean boolValue = rand.nextBoolean(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setInplaceUpdateSupport(boolValue); assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void inplaceUpdateNumLocks() throws RocksDBException { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - long longValue = rand.nextLong(); + try (final 
ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final long longValue = rand.nextLong(); opt.setInplaceUpdateNumLocks(longValue); assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void memtablePrefixBloomBits() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomBits(intValue); - assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test - public void memtablePrefixBloomProbes() { - ColumnFamilyOptions opt = null; - try { - int intValue = rand.nextInt(); - opt = new ColumnFamilyOptions(); - opt.setMemtablePrefixBloomProbes(intValue); - assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } + public void memtablePrefixBloomSizeRatio() { + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final double doubleValue = rand.nextDouble(); + opt.setMemtablePrefixBloomSizeRatio(doubleValue); + assertThat(opt.memtablePrefixBloomSizeRatio()).isEqualTo(doubleValue); } } @Test public void bloomLocality() { - ColumnFamilyOptions opt = null; - try { - int intValue = rand.nextInt(); - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setBloomLocality(intValue); assertThat(opt.bloomLocality()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxSuccessiveMerges() throws RocksDBException { - ColumnFamilyOptions opt = null; - try { - long longValue = rand.nextLong(); - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final long longValue = rand.nextLong(); opt.setMaxSuccessiveMerges(longValue); 
assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void minPartialMergeOperands() { - ColumnFamilyOptions opt = null; - try { - int intValue = rand.nextInt(); - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final int intValue = rand.nextInt(); opt.setMinPartialMergeOperands(intValue); assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void optimizeFiltersForHits() { - ColumnFamilyOptions opt = null; - try { - boolean aBoolean = rand.nextBoolean(); - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { + final boolean aBoolean = rand.nextBoolean(); opt.setOptimizeFiltersForHits(aBoolean); assertThat(opt.optimizeFiltersForHits()).isEqualTo(aBoolean); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void memTable() throws RocksDBException { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { opt.setMemTableConfig(new HashLinkedListMemTableConfig()); assertThat(opt.memTableFactoryName()). 
isEqualTo("HashLinkedListRepFactory"); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void comparator() throws RocksDBException { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { opt.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void linkageOfPrepMethods() { - ColumnFamilyOptions options = null; - try { - options = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { options.optimizeUniversalStyleCompaction(); options.optimizeUniversalStyleCompaction(4000); options.optimizeLevelStyleCompaction(); options.optimizeLevelStyleCompaction(3000); options.optimizeForPointLookup(10); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void shouldSetTestPrefixExtractor() { - ColumnFamilyOptions options = null; - try { - options = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { options.useFixedLengthPrefixExtractor(100); options.useFixedLengthPrefixExtractor(10); - } finally { - if (options != null) { - options.dispose(); - } } } - @Test public void shouldSetTestCappedPrefixExtractor() { - ColumnFamilyOptions options = null; - try { - options = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { options.useCappedPrefixExtractor(100); options.useCappedPrefixExtractor(10); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void compressionTypes() { - ColumnFamilyOptions columnFamilyOptions = null; - try { - columnFamilyOptions = new ColumnFamilyOptions(); - for (CompressionType compressionType : + try (final ColumnFamilyOptions columnFamilyOptions + = new ColumnFamilyOptions()) { + for (final CompressionType compressionType : CompressionType.values()) { 
columnFamilyOptions.setCompressionType(compressionType); assertThat(columnFamilyOptions.compressionType()). @@ -643,21 +395,16 @@ public void compressionTypes() { assertThat(CompressionType.valueOf("NO_COMPRESSION")). isEqualTo(CompressionType.NO_COMPRESSION); } - } finally { - if (columnFamilyOptions != null) { - columnFamilyOptions.dispose(); - } } } @Test public void compressionPerLevel() { - ColumnFamilyOptions columnFamilyOptions = null; - try { - columnFamilyOptions = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions columnFamilyOptions + = new ColumnFamilyOptions()) { assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty(); List compressionTypeList = new ArrayList<>(); - for (int i=0; i < columnFamilyOptions.numLevels(); i++) { + for (int i = 0; i < columnFamilyOptions.numLevels(); i++) { compressionTypeList.add(CompressionType.NO_COMPRESSION); } columnFamilyOptions.setCompressionPerLevel(compressionTypeList); @@ -666,18 +413,13 @@ public void compressionPerLevel() { assertThat(compressionType).isEqualTo( CompressionType.NO_COMPRESSION); } - } finally { - if (columnFamilyOptions != null) { - columnFamilyOptions.dispose(); - } } } @Test public void differentCompressionsPerLevel() { - ColumnFamilyOptions columnFamilyOptions = null; - try { - columnFamilyOptions = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions columnFamilyOptions + = new ColumnFamilyOptions()) { columnFamilyOptions.setNumLevels(3); assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty(); @@ -697,38 +439,27 @@ public void differentCompressionsPerLevel() { CompressionType.SNAPPY_COMPRESSION, CompressionType.LZ4_COMPRESSION); - } finally { - if (columnFamilyOptions != null) { - columnFamilyOptions.dispose(); - } } } @Test public void compactionStyles() { - ColumnFamilyOptions ColumnFamilyOptions = null; - try { - ColumnFamilyOptions = new ColumnFamilyOptions(); - for (CompactionStyle compactionStyle : + try (final ColumnFamilyOptions columnFamilyOptions + = 
new ColumnFamilyOptions()) { + for (final CompactionStyle compactionStyle : CompactionStyle.values()) { - ColumnFamilyOptions.setCompactionStyle(compactionStyle); - assertThat(ColumnFamilyOptions.compactionStyle()). + columnFamilyOptions.setCompactionStyle(compactionStyle); + assertThat(columnFamilyOptions.compactionStyle()). isEqualTo(compactionStyle); assertThat(CompactionStyle.valueOf("FIFO")). isEqualTo(CompactionStyle.FIFO); } - } finally { - if (ColumnFamilyOptions != null) { - ColumnFamilyOptions.dispose(); - } } } @Test public void maxTableFilesSizeFIFO() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { long longValue = rand.nextLong(); // Size has to be positive longValue = (longValue < 0) ? -longValue : longValue; @@ -736,10 +467,6 @@ public void maxTableFilesSizeFIFO() { opt.setMaxTableFilesSizeFIFO(longValue); assertThat(opt.maxTableFilesSizeFIFO()). isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index decdbbcb21..c5b4fe96af 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -25,432 +25,350 @@ public class ColumnFamilyTest { @Test public void listColumnFamilies() throws RocksDBException { - RocksDB db = null; - Options options = null; - try { - options = new Options(); - options.setCreateIfMissing(true); - - DBOptions dbOptions = new DBOptions(); - dbOptions.setCreateIfMissing(true); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { // Test listColumnFamilies - List columnFamilyNames; - columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + final List columnFamilyNames = RocksDB.listColumnFamilies(options, + dbFolder.getRoot().getAbsolutePath()); assertThat(columnFamilyNames).isNotNull(); assertThat(columnFamilyNames.size()).isGreaterThan(0); assertThat(columnFamilyNames.size()).isEqualTo(1); assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default"); - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } } } @Test public void defaultColumnFamily() throws RocksDBException { - RocksDB db = null; - Options options = null; - ColumnFamilyHandle cfh; - try { - options = new Options().setCreateIfMissing(true); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + final ColumnFamilyHandle cfh = db.getDefaultColumnFamily(); + try { + assertThat(cfh).isNotNull(); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - cfh = db.getDefaultColumnFamily(); - assertThat(cfh).isNotNull(); + final byte[] key = "key".getBytes(); + final byte[] value = "value".getBytes(); - final byte[] key = "key".getBytes(); - final byte[] value = "value".getBytes(); + db.put(cfh, key, value); - db.put(cfh, key, value); + final byte[] actualValue = db.get(cfh, key); - final 
byte[] actualValue = db.get(cfh, key); - - assertThat(cfh).isNotNull(); - assertThat(actualValue).isEqualTo(value); - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + assertThat(cfh).isNotNull(); + assertThat(actualValue).isEqualTo(value); + } finally { + cfh.close(); } } } @Test public void createColumnFamily() throws RocksDBException { - RocksDB db = null; - Options options = null; - ColumnFamilyHandle columnFamilyHandle = null; - try { - options = new Options(); - options.setCreateIfMissing(true); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions())); - - List columnFamilyNames; - columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); - assertThat(columnFamilyNames).isNotNull(); - assertThat(columnFamilyNames.size()).isGreaterThan(0); - assertThat(columnFamilyNames.size()).isEqualTo(2); - assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default"); - assertThat(new String(columnFamilyNames.get(1))).isEqualTo("new_cf"); - } finally { - if (columnFamilyHandle != null) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf".getBytes(), + new ColumnFamilyOptions())); + try { + final List columnFamilyNames = RocksDB.listColumnFamilies( + options, dbFolder.getRoot().getAbsolutePath()); + assertThat(columnFamilyNames).isNotNull(); + assertThat(columnFamilyNames.size()).isGreaterThan(0); + assertThat(columnFamilyNames.size()).isEqualTo(2); + assertThat(new 
String(columnFamilyNames.get(0))).isEqualTo("default"); + assertThat(new String(columnFamilyNames.get(1))).isEqualTo("new_cf"); + } finally { + columnFamilyHandle.close(); } } } @Test public void openWithColumnFamilies() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = + final List cfNames = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes()) + ); + + final List columnFamilyHandleList = new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - options.setCreateMissingColumnFamilies(true); - // Test open database with column family names - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - assertThat(columnFamilyHandleList.size()).isEqualTo(2); - db.put("dfkey1".getBytes(), "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), - "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), - "newcfvalue".getBytes()); - - String retVal = new String(db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes())); - assertThat(retVal).isEqualTo("newcfvalue"); - assertThat((db.get(columnFamilyHandleList.get(1), - "dfkey1".getBytes()))).isNull(); - db.remove(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); - assertThat((db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes()))).isNull(); - db.remove(columnFamilyHandleList.get(0), new WriteOptions(), - "dfkey2".getBytes()); - assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "dfkey2".getBytes())).isNull(); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != 
null) { - db.close(); - } - if (options != null) { - options.dispose(); + + // Test open database with column family names + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), cfNames, + columnFamilyHandleList)) { + + try { + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + db.put("dfkey1".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), + "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), + "newcfvalue".getBytes()); + + String retVal = new String(db.get(columnFamilyHandleList.get(1), + "newcfkey1".getBytes())); + assertThat(retVal).isEqualTo("newcfvalue"); + assertThat((db.get(columnFamilyHandleList.get(1), + "dfkey1".getBytes()))).isNull(); + db.remove(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); + assertThat((db.get(columnFamilyHandleList.get(1), + "newcfkey1".getBytes()))).isNull(); + db.remove(columnFamilyHandleList.get(0), new WriteOptions(), + "dfkey2".getBytes()); + assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), + "dfkey2".getBytes())).isNull(); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test public void getWithOutValueAndCf() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfDescriptors = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - options.setCreateMissingColumnFamilies(true); - // Test open database with column family names - cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList); - db.put(columnFamilyHandleList.get(0), new 
WriteOptions(), - "key1".getBytes(), "value".getBytes()); - db.put("key2".getBytes(), "12345678".getBytes()); - byte[] outValue = new byte[5]; - // not found value - int getResult = db.get("keyNotFound".getBytes(), outValue); - assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); - // found value which fits in outValue - getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("value".getBytes()); - // found value which fits partially - getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "key2".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("12345".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList = new ArrayList<>(); + + // Test open database with column family names + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + try { + db.put(columnFamilyHandleList.get(0), new WriteOptions(), + "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + final byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), + outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + 
assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), + "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test public void createWriteDropColumnFamily() throws RocksDBException { - RocksDB db = null; - DBOptions opt = null; - ColumnFamilyHandle tmpColumnFamilyHandle = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - opt = new DBOptions(); - opt.setCreateIfMissing(true); - opt.setCreateMissingColumnFamilies(true); - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.dropColumnFamily(tmpColumnFamilyHandle); - tmpColumnFamilyHandle.dispose(); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + 
dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + ColumnFamilyHandle tmpColumnFamilyHandle = null; + try { + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), + new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.dropColumnFamily(tmpColumnFamilyHandle); + } finally { + if (tmpColumnFamilyHandle != null) { + tmpColumnFamilyHandle.close(); + } + for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test public void writeBatch() throws RocksDBException { - RocksDB db = null; - DBOptions opt = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - opt = new DBOptions(); - opt.setCreateIfMissing(true); - opt.setCreateMissingColumnFamilies(true); - - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions().setMergeOperator(new StringAppendOperator()))); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - - WriteBatch writeBatch = new WriteBatch(); - WriteOptions writeOpt = new WriteOptions(); - writeBatch.put("key".getBytes(), "value".getBytes()); - writeBatch.put(db.getDefaultColumnFamily(), - "mergeKey".getBytes(), "merge".getBytes()); - writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), - "merge".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - writeBatch.remove("xyz".getBytes()); - writeBatch.remove(columnFamilyHandleList.get(1), "xyz".getBytes()); - db.write(writeOpt, writeBatch); - writeBatch.dispose(); - assertThat(db.get(columnFamilyHandleList.get(1), - "xyz".getBytes()) == null); - assertThat(new 
String(db.get(columnFamilyHandleList.get(1), - "newcfkey".getBytes()))).isEqualTo("value"); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey2".getBytes()))).isEqualTo("value2"); - assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); - // check if key is merged - assertThat(new String(db.get(db.getDefaultColumnFamily(), - "mergeKey".getBytes()))).isEqualTo("merge,merge"); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + try (final ColumnFamilyOptions defaultCfOptions = new ColumnFamilyOptions() + .setMergeOperator(new StringAppendOperator())) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + defaultCfOptions), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList); + final WriteBatch writeBatch = new WriteBatch(); + final WriteOptions writeOpt = new WriteOptions()) { + try { + writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(db.getDefaultColumnFamily(), + "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), + "merge".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), + "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), + "value2".getBytes()); + writeBatch.remove("xyz".getBytes()); + writeBatch.remove(columnFamilyHandleList.get(1), "xyz".getBytes()); + db.write(writeOpt, writeBatch); + + assertThat(db.get(columnFamilyHandleList.get(1), + "xyz".getBytes()) == null); + 
assertThat(new String(db.get(columnFamilyHandleList.get(1), + "newcfkey".getBytes()))).isEqualTo("value"); + assertThat(new String(db.get(columnFamilyHandleList.get(1), + "newcfkey2".getBytes()))).isEqualTo("value2"); + assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + // check if key is merged + assertThat(new String(db.get(db.getDefaultColumnFamily(), + "mergeKey".getBytes()))).isEqualTo("merge,merge"); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } @Test public void iteratorOnColumnFamily() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - RocksIterator rocksIterator = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - options.setCreateMissingColumnFamilies(true); - - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - rocksIterator = db.newIterator( - columnFamilyHandleList.get(1)); - rocksIterator.seekToFirst(); - Map refMap = new HashMap<>(); - refMap.put("newcfkey", "value"); - refMap.put("newcfkey2", "value2"); - int i = 0; - while (rocksIterator.isValid()) { - i++; - assertThat(refMap.get(new String(rocksIterator.key()))). 
- isEqualTo(new String(rocksIterator.value())); - rocksIterator.next(); - } - assertThat(i).isEqualTo(2); - rocksIterator.dispose(); - } finally { - if (rocksIterator != null) { - rocksIterator.dispose(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try { + + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), + "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), + "value2".getBytes()); + try (final RocksIterator rocksIterator = + db.newIterator(columnFamilyHandleList.get(1))) { + rocksIterator.seekToFirst(); + Map refMap = new HashMap<>(); + refMap.put("newcfkey", "value"); + refMap.put("newcfkey2", "value2"); + int i = 0; + while (rocksIterator.isValid()) { + i++; + assertThat(refMap.get(new String(rocksIterator.key()))). 
+ isEqualTo(new String(rocksIterator.value())); + rocksIterator.next(); + } + assertThat(i).isEqualTo(2); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test public void multiGet() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfDescriptors = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - options.setCreateMissingColumnFamilies(true); - - cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList); - db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); - - List keys = new ArrayList<>(); - keys.add("key".getBytes()); - keys.add("newcfkey".getBytes()); - Map retValues = db.multiGet(columnFamilyHandleList, keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(keys.get(0)))) - .isEqualTo("value"); - assertThat(new String(retValues.get(keys.get(1)))) - .isEqualTo("value"); - retValues = db.multiGet(new ReadOptions(), columnFamilyHandleList, keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(keys.get(0)))) - .isEqualTo("value"); - assertThat(new String(retValues.get(keys.get(1)))) - .isEqualTo("value"); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final 
List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try { + db.put(columnFamilyHandleList.get(0), "key".getBytes(), + "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), + "value".getBytes()); + + final List keys = Arrays.asList(new byte[][]{ + "key".getBytes(), "newcfkey".getBytes() + }); + Map retValues = db.multiGet(columnFamilyHandleList, + keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(keys.get(0)))) + .isEqualTo("value"); + assertThat(new String(retValues.get(keys.get(1)))) + .isEqualTo("value"); + retValues = db.multiGet(new ReadOptions(), columnFamilyHandleList, + keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(keys.get(0)))) + .isEqualTo("value"); + assertThat(new String(retValues.get(keys.get(1)))) + .isEqualTo("value"); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test public void properties() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - options.setCreateMissingColumnFamilies(true); - - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - assertThat(db.getProperty("rocksdb.estimate-num-keys")). 
- isNotNull(); - assertThat(db.getLongProperty(columnFamilyHandleList.get(0), - "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); - assertThat(db.getProperty("rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(0), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.estimate-num-keys")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.sstables")).isNotNull(); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try { + assertThat(db.getProperty("rocksdb.estimate-num-keys")). 
+ isNotNull(); + assertThat(db.getLongProperty(columnFamilyHandleList.get(0), + "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); + assertThat(db.getProperty("rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(0), + "rocksdb.sstables")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.sstables")).isNotNull(); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @@ -458,289 +376,230 @@ public void properties() throws RocksDBException { @Test public void iterators() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - List iterators = null; - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - options.setCreateMissingColumnFamilies(true); - - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - iterators = db.newIterators(columnFamilyHandleList); - assertThat(iterators.size()).isEqualTo(2); - RocksIterator iter = iterators.get(0); - iter.seekToFirst(); - Map defRefMap = new HashMap<>(); - defRefMap.put("dfkey1", "dfvalue"); - defRefMap.put("key", "value"); - while (iter.isValid()) { - assertThat(defRefMap.get(new String(iter.key()))). 
- isEqualTo(new String(iter.value())); - iter.next(); - } - // iterate over new_cf key/value pairs - Map cfRefMap = new HashMap<>(); - cfRefMap.put("newcfkey", "value"); - cfRefMap.put("newcfkey2", "value2"); - iter = iterators.get(1); - iter.seekToFirst(); - while (iter.isValid()) { - assertThat(cfRefMap.get(new String(iter.key()))). - isEqualTo(new String(iter.value())); - iter.next(); - } - } finally { - if (iterators != null) { - for (RocksIterator rocksIterator : iterators) { - rocksIterator.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + List iterators = null; + try { + iterators = db.newIterators(columnFamilyHandleList); + assertThat(iterators.size()).isEqualTo(2); + RocksIterator iter = iterators.get(0); + iter.seekToFirst(); + final Map defRefMap = new HashMap<>(); + defRefMap.put("dfkey1", "dfvalue"); + defRefMap.put("key", "value"); + while (iter.isValid()) { + assertThat(defRefMap.get(new String(iter.key()))). + isEqualTo(new String(iter.value())); + iter.next(); + } + // iterate over new_cf key/value pairs + final Map cfRefMap = new HashMap<>(); + cfRefMap.put("newcfkey", "value"); + cfRefMap.put("newcfkey2", "value2"); + iter = iterators.get(1); + iter.seekToFirst(); + while (iter.isValid()) { + assertThat(cfRefMap.get(new String(iter.key()))). 
+ isEqualTo(new String(iter.value())); + iter.next(); + } + } finally { + if (iterators != null) { + for (final RocksIterator rocksIterator : iterators) { + rocksIterator.close(); + } + } + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); } - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); } } } @Test(expected = RocksDBException.class) public void failPutDisposedCF() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try { + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.put(columnFamilyHandleList.get(1), "key".getBytes(), + "value".getBytes()); + } finally { + for 
(ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test(expected = RocksDBException.class) public void failRemoveDisposedCF() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.remove(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try { + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.remove(columnFamilyHandleList.get(1), "key".getBytes()); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test(expected = RocksDBException.class) public void failGetDisposedCF() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - - cfNames.add(new 
ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.get(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + try { + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.get(columnFamilyHandleList.get(1), "key".getBytes()); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test(expected = RocksDBException.class) public void failMultiGetWithoutCorrectNumberOfCF() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true); - - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); - List keys = new ArrayList<>(); - keys.add("key".getBytes()); - keys.add("newcfkey".getBytes()); - List cfCustomList = new ArrayList<>(); - db.multiGet(cfCustomList, keys); - - } finally { - for 
(ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + try { + final List keys = new ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + final List cfCustomList = new ArrayList<>(); + db.multiGet(cfCustomList, keys); + + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @Test public void testByteCreateFolumnFamily() throws RocksDBException { - RocksDB db = null; - Options options = null; - ColumnFamilyHandle cf1 = null, cf2 = null, cf3 = null; - try { - options = new Options().setCreateIfMissing(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - - byte[] b0 = new byte[] { (byte)0x00 }; - byte[] b1 = new byte[] { (byte)0x01 }; - byte[] b2 = new byte[] { (byte)0x02 }; - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - List families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - cf3 = db.createColumnFamily(new ColumnFamilyDescriptor(b2)); - } finally { - if (cf1 != null) { - cf1.dispose(); - } - if (cf2 != null) { - cf2.dispose(); - } - if (cf3 != null) { - cf3.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + + try (final Options options = new 
Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { + final byte[] b0 = new byte[]{(byte) 0x00}; + final byte[] b1 = new byte[]{(byte) 0x01}; + final byte[] b2 = new byte[]{(byte) 0x02}; + ColumnFamilyHandle cf1 = null, cf2 = null, cf3 = null; + try { + cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = RocksDB.listColumnFamilies(options, + dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); + cf3 = db.createColumnFamily(new ColumnFamilyDescriptor(b2)); + } finally { + if (cf1 != null) { + cf1.close(); + } + if (cf2 != null) { + cf2.close(); + } + if (cf3 != null) { + cf3.close(); + } } } } @Test public void testCFNamesWithZeroBytes() throws RocksDBException { - RocksDB db = null; - Options options = null; ColumnFamilyHandle cf1 = null, cf2 = null; - try { - options = new Options().setCreateIfMissing(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - - byte[] b0 = new byte[] { 0, 0 }; - byte[] b1 = new byte[] { 0, 1 }; - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - List families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - } finally { - if (cf1 != null) { - cf1.dispose(); - } - if (cf2 != null) { - cf2.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + ) { + try { + final byte[] b0 = new byte[]{0, 0}; + final byte[] b1 = new byte[]{0, 1}; + cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + cf2 = db.createColumnFamily(new 
ColumnFamilyDescriptor(b1)); + final List families = RocksDB.listColumnFamilies(options, + dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); + } finally { + if (cf1 != null) { + cf1.close(); + } + if (cf2 != null) { + cf2.close(); + } } } } @Test public void testCFNameSimplifiedChinese() throws RocksDBException { - RocksDB db = null; - Options options = null; ColumnFamilyHandle columnFamilyHandle = null; - try { - options = new Options().setCreateIfMissing(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - final String simplifiedChinese = "\u7b80\u4f53\u5b57"; - columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); - - List families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), simplifiedChinese.getBytes()); - } finally { - if (columnFamilyHandle != null) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + ) { + try { + final String simplifiedChinese = "\u7b80\u4f53\u5b57"; + columnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); + + final List families = RocksDB.listColumnFamilies(options, + dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), + simplifiedChinese.getBytes()); + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.close(); + } } } - - } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java index 4f8a7d1a6f..fcdd09acba 100644 --- 
a/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,18 +18,15 @@ public class ComparatorOptionsTest { @Test public void comparatorOptions() { - final ComparatorOptions copt = new ComparatorOptions(); + try(final ComparatorOptions copt = new ComparatorOptions()) { - assertThat(copt).isNotNull(); - - { // UseAdaptiveMutex test + assertThat(copt).isNotNull(); + // UseAdaptiveMutex test copt.setUseAdaptiveMutex(true); assertThat(copt.useAdaptiveMutex()).isTrue(); copt.setUseAdaptiveMutex(false); assertThat(copt.useAdaptiveMutex()).isFalse(); } - - copt.dispose(); } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java index e689a9cf5d..b348218447 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -79,66 +79,52 @@ public int compare(final Slice a, final Slice b) { @Test public void builtinForwardComparator() throws RocksDBException { - Options options = null; - RocksDB rocksDB = null; - RocksIterator rocksIterator = null; - try { - options = new Options(); - options.setCreateIfMissing(true); - options.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); - rocksDB = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - - rocksDB.put("abc1".getBytes(), "abc1".getBytes()); - rocksDB.put("abc2".getBytes(), "abc2".getBytes()); - rocksDB.put("abc3".getBytes(), "abc3".getBytes()); - - rocksIterator = rocksDB.newIterator(); - // Iterate over keys using a iterator - rocksIterator.seekToFirst(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc2".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc2".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isFalse(); - // Get last one - rocksIterator.seekToLast(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - // Seek for abc - rocksIterator.seek("abc".getBytes()); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - - } finally { - if (rocksIterator != null) { - rocksIterator.dispose(); - } - if (rocksDB != null) { - rocksDB.close(); - } - if 
(options != null) { - options.dispose(); + try (final Options options = new Options() + .setCreateIfMissing(true) + .setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); + final RocksDB rocksDb = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { + rocksDb.put("abc1".getBytes(), "abc1".getBytes()); + rocksDb.put("abc2".getBytes(), "abc2".getBytes()); + rocksDb.put("abc3".getBytes(), "abc3".getBytes()); + + try(final RocksIterator rocksIterator = rocksDb.newIterator()) { + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + // Seek for abc + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); } } } @@ -146,69 +132,56 @@ public void builtinForwardComparator() @Test public void builtinReverseComparator() throws RocksDBException { - Options options = null; - RocksDB rocksDB = null; - RocksIterator rocksIterator = null; - try { - options = new Options(); - 
options.setCreateIfMissing(true); - options.setComparator( - BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR); - rocksDB = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - - rocksDB.put("abc1".getBytes(), "abc1".getBytes()); - rocksDB.put("abc2".getBytes(), "abc2".getBytes()); - rocksDB.put("abc3".getBytes(), "abc3".getBytes()); - - rocksIterator = rocksDB.newIterator(); - // Iterate over keys using a iterator - rocksIterator.seekToFirst(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc2".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc2".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isFalse(); - // Get last one - rocksIterator.seekToLast(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - // Will be invalid because abc is after abc1 - rocksIterator.seek("abc".getBytes()); - assertThat(rocksIterator.isValid()).isFalse(); - // Will be abc3 because the next one after abc999 - // is abc3 - rocksIterator.seek("abc999".getBytes()); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - } finally { - if (rocksIterator != null) { - rocksIterator.dispose(); - } - if (rocksDB != null) { - rocksDB.close(); - } - if (options != null) { - options.dispose(); + try (final Options options = new Options() + .setCreateIfMissing(true) + 
.setComparator(BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR); + final RocksDB rocksDb = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { + + rocksDb.put("abc1".getBytes(), "abc1".getBytes()); + rocksDb.put("abc2".getBytes(), "abc2".getBytes()); + rocksDb.put("abc3".getBytes(), "abc3".getBytes()); + + try (final RocksIterator rocksIterator = rocksDb.newIterator()) { + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + // Will be invalid because abc is after abc1 + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isFalse(); + // Will be abc3 because the next one after abc999 + // is abc3 + rocksIterator.seek("abc999".getBytes()); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java index bff4d5f6c2..51b7259f6a 100644 --- 
a/external/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,11 +8,10 @@ import org.junit.Test; -public class CompressionOptionsTest -{ +public class CompressionOptionsTest { @Test public void getCompressionType() { - for (CompressionType compressionType : CompressionType.values()) { + for (final CompressionType compressionType : CompressionType.values()) { String libraryName = compressionType.getLibraryName(); compressionType.equals(CompressionType.getCompressionType( libraryName)); diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java index 98ba4ce381..523e537840 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -24,547 +24,339 @@ public class DBOptionsTest { @Test public void getDBOptionsFromProps() { - DBOptions opt = null; - try { - // setup sample properties - Properties properties = new Properties(); - properties.put("allow_mmap_reads", "true"); - properties.put("bytes_per_sync", "13"); - opt = DBOptions.getDBOptionsFromProps(properties); + // setup sample properties + final Properties properties = new Properties(); + properties.put("allow_mmap_reads", "true"); + properties.put("bytes_per_sync", "13"); + try(final DBOptions opt = DBOptions.getDBOptionsFromProps(properties)) { assertThat(opt).isNotNull(); assertThat(String.valueOf(opt.allowMmapReads())). isEqualTo(properties.get("allow_mmap_reads")); assertThat(String.valueOf(opt.bytesPerSync())). isEqualTo(properties.get("bytes_per_sync")); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void failDBOptionsFromPropsWithIllegalValue() { - DBOptions opt = null; - try { - // setup sample properties - Properties properties = new Properties(); - properties.put("tomato", "1024"); - properties.put("burger", "2"); - opt = DBOptions. 
- getDBOptionsFromProps(properties); + // setup sample properties + final Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + try(final DBOptions opt = DBOptions.getDBOptionsFromProps(properties)) { assertThat(opt).isNull(); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test(expected = IllegalArgumentException.class) public void failDBOptionsFromPropsWithNullValue() { - DBOptions.getDBOptionsFromProps(null); + try(final DBOptions opt = DBOptions.getDBOptionsFromProps(null)) { + //no-op + } } @Test(expected = IllegalArgumentException.class) public void failDBOptionsFromPropsWithEmptyProps() { - DBOptions.getDBOptionsFromProps( - new Properties()); + try(final DBOptions opt = DBOptions.getDBOptionsFromProps( + new Properties())) { + //no-op + } } @Test public void setIncreaseParallelism() { - DBOptions opt = null; - try { - opt = new DBOptions(); + try(final DBOptions opt = new DBOptions()) { final int threads = Runtime.getRuntime().availableProcessors() * 2; opt.setIncreaseParallelism(threads); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void createIfMissing() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setCreateIfMissing(boolValue); - assertThat(opt.createIfMissing()). - isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.createIfMissing()).isEqualTo(boolValue); } } @Test public void createMissingColumnFamilies() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setCreateMissingColumnFamilies(boolValue); - assertThat(opt.createMissingColumnFamilies()). 
- isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.createMissingColumnFamilies()).isEqualTo(boolValue); } } @Test public void errorIfExists() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setErrorIfExists(boolValue); assertThat(opt.errorIfExists()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void paranoidChecks() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setParanoidChecks(boolValue); - assertThat(opt.paranoidChecks()). - isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.paranoidChecks()).isEqualTo(boolValue); } } @Test public void maxTotalWalSize() { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setMaxTotalWalSize(longValue); - assertThat(opt.maxTotalWalSize()). 
- isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.maxTotalWalSize()).isEqualTo(longValue); } } @Test public void maxOpenFiles() { - DBOptions opt = null; - try { - opt = new DBOptions(); - int intValue = rand.nextInt(); + try(final DBOptions opt = new DBOptions()) { + final int intValue = rand.nextInt(); opt.setMaxOpenFiles(intValue); assertThat(opt.maxOpenFiles()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void disableDataSync() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setDisableDataSync(boolValue); - assertThat(opt.disableDataSync()). - isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.disableDataSync()).isEqualTo(boolValue); } } @Test public void useFsync() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setUseFsync(boolValue); assertThat(opt.useFsync()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void dbLogDir() { - DBOptions opt = null; - try { - opt = new DBOptions(); - String str = "path/to/DbLogDir"; + try(final DBOptions opt = new DBOptions()) { + final String str = "path/to/DbLogDir"; opt.setDbLogDir(str); assertThat(opt.dbLogDir()).isEqualTo(str); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void walDir() { - DBOptions opt = null; - try { - opt = new DBOptions(); - String str = "path/to/WalDir"; + try(final DBOptions opt = new DBOptions()) { + final String str = "path/to/WalDir"; opt.setWalDir(str); assertThat(opt.walDir()).isEqualTo(str); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void 
deleteObsoleteFilesPeriodMicros() { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setDeleteObsoleteFilesPeriodMicros(longValue); - assertThat(opt.deleteObsoleteFilesPeriodMicros()). - isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.deleteObsoleteFilesPeriodMicros()).isEqualTo(longValue); } } @Test public void maxBackgroundCompactions() { - DBOptions opt = null; - try { - opt = new DBOptions(); - int intValue = rand.nextInt(); + try(final DBOptions opt = new DBOptions()) { + final int intValue = rand.nextInt(); opt.setMaxBackgroundCompactions(intValue); - assertThat(opt.maxBackgroundCompactions()). - isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.maxBackgroundCompactions()).isEqualTo(intValue); } } @Test public void maxBackgroundFlushes() { - DBOptions opt = null; - try { - opt = new DBOptions(); - int intValue = rand.nextInt(); + try(final DBOptions opt = new DBOptions()) { + final int intValue = rand.nextInt(); opt.setMaxBackgroundFlushes(intValue); - assertThat(opt.maxBackgroundFlushes()). 
- isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.maxBackgroundFlushes()).isEqualTo(intValue); } } @Test public void maxLogFileSize() throws RocksDBException { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setMaxLogFileSize(longValue); assertThat(opt.maxLogFileSize()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void logFileTimeToRoll() throws RocksDBException { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setLogFileTimeToRoll(longValue); - assertThat(opt.logFileTimeToRoll()). - isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.logFileTimeToRoll()).isEqualTo(longValue); } } @Test public void keepLogFileNum() throws RocksDBException { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setKeepLogFileNum(longValue); assertThat(opt.keepLogFileNum()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxManifestFileSize() { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setMaxManifestFileSize(longValue); - assertThat(opt.maxManifestFileSize()). 
- isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.maxManifestFileSize()).isEqualTo(longValue); } } @Test public void tableCacheNumshardbits() { - DBOptions opt = null; - try { - opt = new DBOptions(); - int intValue = rand.nextInt(); + try(final DBOptions opt = new DBOptions()) { + final int intValue = rand.nextInt(); opt.setTableCacheNumshardbits(intValue); - assertThat(opt.tableCacheNumshardbits()). - isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.tableCacheNumshardbits()).isEqualTo(intValue); } } @Test public void walSizeLimitMB() { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setWalSizeLimitMB(longValue); assertThat(opt.walSizeLimitMB()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void walTtlSeconds() { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setWalTtlSeconds(longValue); assertThat(opt.walTtlSeconds()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void manifestPreallocationSize() throws RocksDBException { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setManifestPreallocationSize(longValue); - assertThat(opt.manifestPreallocationSize()). 
- isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } + assertThat(opt.manifestPreallocationSize()).isEqualTo(longValue); } } @Test public void allowOsBuffer() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setAllowOsBuffer(boolValue); assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void allowMmapReads() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setAllowMmapReads(boolValue); assertThat(opt.allowMmapReads()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void allowMmapWrites() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setAllowMmapWrites(boolValue); assertThat(opt.allowMmapWrites()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void isFdCloseOnExec() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setIsFdCloseOnExec(boolValue); assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void statsDumpPeriodSec() { - DBOptions opt = null; - try { - opt = new DBOptions(); - int intValue = rand.nextInt(); + try(final DBOptions opt = new DBOptions()) { + final int intValue = rand.nextInt(); opt.setStatsDumpPeriodSec(intValue); assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue); - } finally 
{ - if (opt != null) { - opt.dispose(); - } } } @Test public void adviseRandomOnOpen() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setAdviseRandomOnOpen(boolValue); assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void useAdaptiveMutex() { - DBOptions opt = null; - try { - opt = new DBOptions(); - boolean boolValue = rand.nextBoolean(); + try(final DBOptions opt = new DBOptions()) { + final boolean boolValue = rand.nextBoolean(); opt.setUseAdaptiveMutex(boolValue); assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void bytesPerSync() { - DBOptions opt = null; - try { - opt = new DBOptions(); - long longValue = rand.nextLong(); + try(final DBOptions opt = new DBOptions()) { + final long longValue = rand.nextLong(); opt.setBytesPerSync(longValue); assertThat(opt.bytesPerSync()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void rateLimiterConfig() { - DBOptions options = null; - DBOptions anotherOptions = null; - try { - options = new DBOptions(); - RateLimiterConfig rateLimiterConfig = + try(final DBOptions options = new DBOptions(); + final DBOptions anotherOptions = new DBOptions()) { + final RateLimiterConfig rateLimiterConfig = new GenericRateLimiterConfig(1000, 100 * 1000, 1); options.setRateLimiterConfig(rateLimiterConfig); // Test with parameter initialization - anotherOptions = new DBOptions(); + anotherOptions.setRateLimiterConfig( new GenericRateLimiterConfig(1000)); - } finally { - if (options != null) { - options.dispose(); - } - if (anotherOptions != null) { - anotherOptions.dispose(); - } } } @Test public void statistics() { - DBOptions options = new DBOptions(); - Statistics statistics = 
options.createStatistics(). - statisticsPtr(); - assertThat(statistics).isNotNull(); - - DBOptions anotherOptions = new DBOptions(); - statistics = anotherOptions.statisticsPtr(); - assertThat(statistics).isNotNull(); + try(final DBOptions options = new DBOptions()) { + Statistics statistics = options.createStatistics(). + statisticsPtr(); + assertThat(statistics).isNotNull(); + + try(final DBOptions anotherOptions = new DBOptions()) { + statistics = anotherOptions.statisticsPtr(); + assertThat(statistics).isNotNull(); + } + } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java index be84d66472..abdbeada9e 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java index 123eed2e7c..2d3abea45b 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -18,11 +18,8 @@ public class DirectSliceTest { @Test public void directSlice() { - DirectSlice directSlice = null; - DirectSlice otherSlice = null; - try { - directSlice = new DirectSlice("abc"); - otherSlice = new DirectSlice("abc"); + try(final DirectSlice directSlice = new DirectSlice("abc"); + final DirectSlice otherSlice = new DirectSlice("abc")) { assertThat(directSlice.toString()).isEqualTo("abc"); // clear first slice directSlice.clear(); @@ -32,75 +29,46 @@ public void directSlice() { // remove prefix otherSlice.removePrefix(1); assertThat(otherSlice.toString()).isEqualTo("bc"); - } finally { - if (directSlice != null) { - directSlice.dispose(); - } - if (otherSlice != null) { - otherSlice.dispose(); - } } } @Test public void directSliceWithByteBuffer() { - DirectSlice directSlice = null; - try { - byte[] data = "Some text".getBytes(); - ByteBuffer buffer = ByteBuffer.allocateDirect(data.length + 1); - buffer.put(data); - buffer.put(data.length, (byte)0); + final byte[] data = "Some text".getBytes(); + final ByteBuffer buffer = ByteBuffer.allocateDirect(data.length + 1); + buffer.put(data); + buffer.put(data.length, (byte)0); - directSlice = new DirectSlice(buffer); + try(final DirectSlice directSlice = new DirectSlice(buffer)) { assertThat(directSlice.toString()).isEqualTo("Some text"); - } finally { - if (directSlice != null) { - directSlice.dispose(); - } } } @Test public void directSliceWithByteBufferAndLength() { - DirectSlice directSlice = null; - try { - byte[] data = "Some text".getBytes(); - ByteBuffer buffer = ByteBuffer.allocateDirect(data.length); - buffer.put(data); - directSlice = new DirectSlice(buffer, 4); + final byte[] data = "Some text".getBytes(); + final ByteBuffer buffer = ByteBuffer.allocateDirect(data.length); + buffer.put(data); + try(final DirectSlice directSlice = new DirectSlice(buffer, 4)) { assertThat(directSlice.toString()).isEqualTo("Some"); - } finally { - if (directSlice != null) { - directSlice.dispose(); - } } } 
@Test(expected = AssertionError.class) public void directSliceInitWithoutDirectAllocation() { - DirectSlice directSlice = null; - try { - byte[] data = "Some text".getBytes(); - ByteBuffer buffer = ByteBuffer.wrap(data); - directSlice = new DirectSlice(buffer); - } finally { - if (directSlice != null) { - directSlice.dispose(); - } + final byte[] data = "Some text".getBytes(); + final ByteBuffer buffer = ByteBuffer.wrap(data); + try(final DirectSlice directSlice = new DirectSlice(buffer)) { + //no-op } } @Test(expected = AssertionError.class) public void directSlicePrefixInitWithoutDirectAllocation() { - DirectSlice directSlice = null; - try { - byte[] data = "Some text".getBytes(); - ByteBuffer buffer = ByteBuffer.wrap(data); - directSlice = new DirectSlice(buffer, 4); - } finally { - if (directSlice != null) { - directSlice.dispose(); - } + final byte[] data = "Some text".getBytes(); + final ByteBuffer buffer = ByteBuffer.wrap(data); + try(final DirectSlice directSlice = new DirectSlice(buffer, 4)) { + //no-op } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java index 36ce379709..e5bb60fda4 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,31 +16,23 @@ public class FilterTest { @Test public void filter() { - Options options = null; - try { - options = new Options(); - // test table config - options.setTableFormatConfig(new BlockBasedTableConfig(). 
- setFilter(new BloomFilter())); - options.dispose(); - System.gc(); - System.runFinalization(); - // new Bloom filter - options = new Options(); - BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); - blockConfig.setFilter(new BloomFilter()); - options.setTableFormatConfig(blockConfig); - BloomFilter bloomFilter = new BloomFilter(10); - blockConfig.setFilter(bloomFilter); - options.setTableFormatConfig(blockConfig); - System.gc(); - System.runFinalization(); - blockConfig.setFilter(new BloomFilter(10, false)); - options.setTableFormatConfig(blockConfig); - - } finally { - if (options != null) { - options.dispose(); + // new Bloom filter + final BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); + try(final Options options = new Options()) { + + try(final Filter bloomFilter = new BloomFilter()) { + blockConfig.setFilter(bloomFilter); + options.setTableFormatConfig(blockConfig); + } + + try(final Filter bloomFilter = new BloomFilter(10)) { + blockConfig.setFilter(bloomFilter); + options.setTableFormatConfig(blockConfig); + } + + try(final Filter bloomFilter = new BloomFilter(10, false)) { + blockConfig.setFilter(bloomFilter); + options.setTableFormatConfig(blockConfig); } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java index 94a32d3832..f3530292ae 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -22,44 +22,28 @@ public class FlushTest { @Test public void flush() throws RocksDBException { - RocksDB db = null; - Options options = null; - WriteOptions wOpt = null; - FlushOptions flushOptions = null; - try { - options = new Options(); - // Setup options - options.setCreateIfMissing(true); - options.setMaxWriteBufferNumber(10); - options.setMinWriteBufferNumberToMerge(10); - wOpt = new WriteOptions(); - flushOptions = new FlushOptions(); - flushOptions.setWaitForFlush(true); + try(final Options options = new Options() + .setCreateIfMissing(true) + .setMaxWriteBufferNumber(10) + .setMinWriteBufferNumberToMerge(10); + final WriteOptions wOpt = new WriteOptions() + .setDisableWAL(true); + final FlushOptions flushOptions = new FlushOptions() + .setWaitForFlush(true)) { assertThat(flushOptions.waitForFlush()).isTrue(); - wOpt.setDisableWAL(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - db.put(wOpt, "key1".getBytes(), "value1".getBytes()); - db.put(wOpt, "key2".getBytes(), "value2".getBytes()); - db.put(wOpt, "key3".getBytes(), "value3".getBytes()); - db.put(wOpt, "key4".getBytes(), "value4".getBytes()); - assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")).isEqualTo("4"); - db.flush(flushOptions); - assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")). 
- isEqualTo("0"); - } finally { - if (flushOptions != null) { - flushOptions.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } - if (wOpt != null) { - wOpt.dispose(); - } + try(final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + db.put(wOpt, "key1".getBytes(), "value1".getBytes()); + db.put(wOpt, "key2".getBytes(), "value2".getBytes()); + db.put(wOpt, "key3".getBytes(), "value3".getBytes()); + db.put(wOpt, "key4".getBytes(), "value4".getBytes()); + assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")) + .isEqualTo("4"); + db.flush(flushOptions); + assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")) + .isEqualTo("0"); + } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java index 630666b903..48ecfa16a9 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java @@ -4,6 +4,7 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.Environment; import java.io.IOException; @@ -23,81 +24,52 @@ public class InfoLogLevelTest { @Test public void testInfoLogLevel() throws RocksDBException, IOException { - RocksDB db = null; - try { - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + try (final RocksDB db = + RocksDB.open(dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); assertThat(getLogContentsWithoutHeader()).isNotEmpty(); - } finally { - if (db != null) { - db.close(); - } } } @Test - public void testFatalLogLevel() throws RocksDBException, + public void testFatalLogLevel() throws RocksDBException, IOException { - RocksDB db = null; - Options options = null; - try { - options = new Options(). - setCreateIfMissing(true). 
- setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); + try (final Options options = new Options(). + setCreateIfMissing(true). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { assertThat(options.infoLogLevel()). isEqualTo(InfoLogLevel.FATAL_LEVEL); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); db.put("key".getBytes(), "value".getBytes()); // As InfoLogLevel is set to FATAL_LEVEL, here we expect the log // content to be empty. assertThat(getLogContentsWithoutHeader()).isEmpty(); - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } } } @Test public void testFatalLogLevelWithDBOptions() throws RocksDBException, IOException { - RocksDB db = null; - Options options = null; - DBOptions dbOptions = null; - try { - dbOptions = new DBOptions(). - setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); - options = new Options(dbOptions, - new ColumnFamilyOptions()). - setCreateIfMissing(true); + try (final DBOptions dbOptions = new DBOptions(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); + final Options options = new Options(dbOptions, + new ColumnFamilyOptions()). + setCreateIfMissing(true); + final RocksDB db = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { assertThat(dbOptions.infoLogLevel()). isEqualTo(InfoLogLevel.FATAL_LEVEL); assertThat(options.infoLogLevel()). 
isEqualTo(InfoLogLevel.FATAL_LEVEL); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); db.put("key".getBytes(), "value".getBytes()); assertThat(getLogContentsWithoutHeader()).isEmpty(); - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } - if (dbOptions != null) { - dbOptions.dispose(); - } } } @Test(expected = IllegalArgumentException.class) public void failIfIllegalByteValueProvided() { - InfoLogLevel.getInfoLogLevel((byte)-1); + InfoLogLevel.getInfoLogLevel((byte) -1); } @Test @@ -113,9 +85,10 @@ public void valueOf() { * @throws IOException if file is not found. */ private String getLogContentsWithoutHeader() throws IOException { - final String separator = System.getProperty("line.separator"); + final String separator = Environment.isWindows() ? + "\n" : System.getProperty("line.separator"); final String[] lines = new String(readAllBytes(get( - dbFolder.getRoot().getAbsolutePath()+ "/LOG"))).split(separator); + dbFolder.getRoot().getAbsolutePath() + "/LOG"))).split(separator); int first_non_header = lines.length; // Identify the last line of the header diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java index b670caddcf..bc341c9d21 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -10,6 +10,7 @@ import org.junit.rules.TemporaryFolder; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; @@ -25,70 +26,61 @@ public class KeyMayExistTest { @Test public void keyMayExist() throws RocksDBException { - RocksDB db = null; - DBOptions options = null; - List cfDescriptors = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - options = new DBOptions(); - options.setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - // open database using cf names + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes()) + ); - cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList); - assertThat(columnFamilyHandleList.size()). - isEqualTo(2); - db.put("key".getBytes(), "value".getBytes()); - // Test without column family - StringBuffer retValue = new StringBuffer(); - boolean exists = db.keyMayExist("key".getBytes(), retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). - isEqualTo("value"); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try { + assertThat(columnFamilyHandleList.size()). 
+ isEqualTo(2); + db.put("key".getBytes(), "value".getBytes()); + // Test without column family + StringBuffer retValue = new StringBuffer(); + boolean exists = db.keyMayExist("key".getBytes(), retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()).isEqualTo("value"); - // Test without column family but with readOptions - retValue = new StringBuffer(); - exists = db.keyMayExist(new ReadOptions(), "key".getBytes(), - retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). - isEqualTo("value"); + // Test without column family but with readOptions + try (final ReadOptions readOptions = new ReadOptions()) { + retValue = new StringBuffer(); + exists = db.keyMayExist(readOptions, "key".getBytes(), retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()).isEqualTo("value"); + } - // Test with column family - retValue = new StringBuffer(); - exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), - retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). - isEqualTo("value"); + // Test with column family + retValue = new StringBuffer(); + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()).isEqualTo("value"); - // Test with column family and readOptions - retValue = new StringBuffer(); - exists = db.keyMayExist(new ReadOptions(), - columnFamilyHandleList.get(0), "key".getBytes(), - retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). 
- isEqualTo("value"); + // Test with column family and readOptions + try (final ReadOptions readOptions = new ReadOptions()) { + retValue = new StringBuffer(); + exists = db.keyMayExist(readOptions, + columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()).isEqualTo("value"); + } - // KeyMayExist in CF1 must return false - assertThat(db.keyMayExist(columnFamilyHandleList.get(1), - "key".getBytes(), retValue)).isFalse(); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + // KeyMayExist in CF1 must return false + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), + "key".getBytes(), retValue)).isFalse(); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java index 2eff3191a4..f83cff3b7f 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java @@ -6,6 +6,7 @@ import org.junit.rules.TemporaryFolder; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; @@ -19,202 +20,219 @@ public class LoggerTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); - private AtomicInteger logMessageCounter = new AtomicInteger(); - @Test public void customLogger() throws RocksDBException { - RocksDB db = null; - logMessageCounter.set(0); - try { - - // Setup options - final Options options = new Options(). - setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL). 
- setCreateIfMissing(true); - - // Create new logger with max log level passed by options - Logger logger = new Logger(options) { - @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { - assertThat(logMsg).isNotNull(); - assertThat(logMsg.length()).isGreaterThan(0); - logMessageCounter.incrementAndGet(); - } - }; - + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final Options options = new Options(). + setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL). + setCreateIfMissing(true); + final Logger logger = new Logger(options) { + // Create new logger with max log level passed by options + @Override + protected void log(InfoLogLevel infoLogLevel, String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + } + ) { // Set custom logger to options options.setLogger(logger); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + // there should be more than zero received log messages in + // debug level. + assertThat(logMessageCounter.get()).isGreaterThan(0); + } + } + } + + @Test + public void warnLogger() throws RocksDBException { + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final Options options = new Options(). + setInfoLogLevel(InfoLogLevel.WARN_LEVEL). + setCreateIfMissing(true); + + final Logger logger = new Logger(options) { + // Create new logger with max log level passed by options + @Override + protected void log(InfoLogLevel infoLogLevel, String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + } + ) { + + // Set custom logger to options + options.setLogger(logger); - // there should be more than zero received log messages in - // debug level. 
- assertThat(logMessageCounter.get()).isGreaterThan(0); - } finally { - if (db != null) { - db.close(); + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + // there should be zero messages + // using warn level as log level. + assertThat(logMessageCounter.get()).isEqualTo(0); } } - logMessageCounter.set(0); } @Test public void fatalLogger() throws RocksDBException { - RocksDB db = null; - logMessageCounter.set(0); - - try { - // Setup options - final Options options = new Options(). - setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). - setCreateIfMissing(true); - - // Create new logger with max log level passed by options - Logger logger = new Logger(options) { - @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { - assertThat(logMsg).isNotNull(); - assertThat(logMsg.length()).isGreaterThan(0); - logMessageCounter.incrementAndGet(); - } - }; + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final Options options = new Options(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). + setCreateIfMissing(true); + + final Logger logger = new Logger(options) { + // Create new logger with max log level passed by options + @Override + protected void log(InfoLogLevel infoLogLevel, String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + } + ) { // Set custom logger to options options.setLogger(logger); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - - // there should be zero messages - // using fatal level as log level. - assertThat(logMessageCounter.get()).isEqualTo(0); - } finally { - if (db != null) { - db.close(); + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + // there should be zero messages + // using fatal level as log level. 
+ assertThat(logMessageCounter.get()).isEqualTo(0); } } - logMessageCounter.set(0); } @Test public void dbOptionsLogger() throws RocksDBException { - RocksDB db = null; - Logger logger = null; - List cfHandles = new ArrayList<>(); - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - - logMessageCounter.set(0); - try { - // Setup options - final DBOptions options = new DBOptions(). - setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). - setCreateIfMissing(true); - - // Create new logger with max log level passed by options - logger = new Logger(options) { - @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { - assertThat(logMsg).isNotNull(); - assertThat(logMsg.length()).isGreaterThan(0); - logMessageCounter.incrementAndGet(); - } - }; - + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final DBOptions options = new DBOptions(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). + setCreateIfMissing(true); + final Logger logger = new Logger(options) { + // Create new logger with max log level passed by options + @Override + protected void log(InfoLogLevel infoLogLevel, String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + } + ) { // Set custom logger to options options.setLogger(logger); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, cfHandles); - // there should be zero messages - // using fatal level as log level. 
- assertThat(logMessageCounter.get()).isEqualTo(0); - logMessageCounter.set(0); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : cfHandles) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (logger != null) { - logger.dispose(); + + final List cfDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List cfHandles = new ArrayList<>(); + + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, cfHandles)) { + try { + // there should be zero messages + // using fatal level as log level. + assertThat(logMessageCounter.get()).isEqualTo(0); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : cfHandles) { + columnFamilyHandle.close(); + } + } } } } + @Test + public void setWarnLogLevel() { + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final Options options = new Options(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). + setCreateIfMissing(true); + final Logger logger = new Logger(options) { + // Create new logger with max log level passed by options + @Override + protected void log(InfoLogLevel infoLogLevel, String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + } + ) { + assertThat(logger.infoLogLevel()). + isEqualTo(InfoLogLevel.FATAL_LEVEL); + logger.setInfoLogLevel(InfoLogLevel.WARN_LEVEL); + assertThat(logger.infoLogLevel()). + isEqualTo(InfoLogLevel.WARN_LEVEL); + } + } + @Test public void setInfoLogLevel() { - Logger logger = null; - try { - // Setup options - final Options options = new Options(). - setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). 
- setCreateIfMissing(true); - - // Create new logger with max log level passed by options - logger = new Logger(options) { - @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { - assertThat(logMsg).isNotNull(); - assertThat(logMsg.length()).isGreaterThan(0); - logMessageCounter.incrementAndGet(); - } - }; + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final Options options = new Options(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). + setCreateIfMissing(true); + final Logger logger = new Logger(options) { + // Create new logger with max log level passed by options + @Override + protected void log(InfoLogLevel infoLogLevel, String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + } + ) { assertThat(logger.infoLogLevel()). isEqualTo(InfoLogLevel.FATAL_LEVEL); logger.setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL); assertThat(logger.infoLogLevel()). isEqualTo(InfoLogLevel.DEBUG_LEVEL); - } finally { - if (logger != null) { - logger.dispose(); - } } } @Test public void changeLogLevelAtRuntime() throws RocksDBException { - RocksDB db = null; - logMessageCounter.set(0); - - try { - // Setup options - final Options options = new Options(). - setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). - setCreateIfMissing(true); - - // Create new logger with max log level passed by options - Logger logger = new Logger(options) { - @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { - assertThat(logMsg).isNotNull(); - assertThat(logMsg.length()).isGreaterThan(0); - logMessageCounter.incrementAndGet(); - } - }; - + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final Options options = new Options(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). 
+ setCreateIfMissing(true); + + // Create new logger with max log level passed by options + final Logger logger = new Logger(options) { + @Override + protected void log(InfoLogLevel infoLogLevel, String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + } + ) { // Set custom logger to options options.setLogger(logger); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - // there should be zero messages - // using fatal level as log level. - assertThat(logMessageCounter.get()).isEqualTo(0); + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { - // change log level to debug level - logger.setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL); + // there should be zero messages + // using fatal level as log level. + assertThat(logMessageCounter.get()).isEqualTo(0); - db.put("key".getBytes(), "value".getBytes()); - db.flush(new FlushOptions().setWaitForFlush(true)); + // change log level to debug level + logger.setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL); - // messages shall be received due to previous actions. - assertThat(logMessageCounter.get()).isNotEqualTo(0); + db.put("key".getBytes(), "value".getBytes()); + db.flush(new FlushOptions().setWaitForFlush(true)); - } finally { - if (db != null) { - db.close(); + // messages shall be received due to previous actions. + assertThat(logMessageCounter.get()).isNotEqualTo(0); } } - logMessageCounter.set(0); } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java index bfc898c42a..bbd5e2055b 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,9 +18,7 @@ public class MemTableTest { @Test public void hashSkipListMemTable() throws RocksDBException { - Options options = null; - try { - options = new Options(); + try(final Options options = new Options()) { // Test HashSkipListMemTableConfig HashSkipListMemTableConfig memTableConfig = new HashSkipListMemTableConfig(); @@ -40,18 +38,12 @@ public void hashSkipListMemTable() throws RocksDBException { assertThat(memTableConfig.branchingFactor()). isEqualTo(6); options.setMemTableConfig(memTableConfig); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void skipListMemTable() throws RocksDBException { - Options options = null; - try { - options = new Options(); + try(final Options options = new Options()) { SkipListMemTableConfig skipMemTableConfig = new SkipListMemTableConfig(); assertThat(skipMemTableConfig.lookahead()). @@ -60,19 +52,12 @@ public void skipListMemTable() throws RocksDBException { assertThat(skipMemTableConfig.lookahead()). isEqualTo(20); options.setMemTableConfig(skipMemTableConfig); - options.dispose(); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void hashLinkedListMemTable() throws RocksDBException { - Options options = null; - try { - options = new Options(); + try(final Options options = new Options()) { HashLinkedListMemTableConfig hashLinkedListMemTableConfig = new HashLinkedListMemTableConfig(); assertThat(hashLinkedListMemTableConfig.bucketCount()). @@ -107,18 +92,12 @@ public void hashLinkedListMemTable() throws RocksDBException { thresholdUseSkiplist()). 
isEqualTo(29); options.setMemTableConfig(hashLinkedListMemTableConfig); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void vectorMemTable() throws RocksDBException { - Options options = null; - try { - options = new Options(); + try(final Options options = new Options()) { VectorMemTableConfig vectorMemTableConfig = new VectorMemTableConfig(); assertThat(vectorMemTableConfig.reservedSize()). @@ -127,11 +106,6 @@ public void vectorMemTable() throws RocksDBException { assertThat(vectorMemTableConfig.reservedSize()). isEqualTo(123); options.setMemTableConfig(vectorMemTableConfig); - options.dispose(); - } finally { - if (options != null) { - options.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java index a5f8e1fe9d..d38df31958 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java @@ -1,10 +1,11 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
package org.rocksdb; +import java.util.Arrays; import java.util.List; import java.util.ArrayList; @@ -27,78 +28,60 @@ public class MergeTest { @Test public void stringOption() throws InterruptedException, RocksDBException { - RocksDB db = null; - Options opt = null; - try { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - opt = new Options(); - opt.setCreateIfMissing(true); - opt.setMergeOperatorName("stringappend"); - - db = RocksDB.open(opt, db_path_string); + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { // writing aa under key db.put("key".getBytes(), "aa".getBytes()); // merge bb under key db.merge("key".getBytes(), "bb".getBytes()); - byte[] value = db.get("key".getBytes()); - String strValue = new String(value); + final byte[] value = db.get("key".getBytes()); + final String strValue = new String(value); assertThat(strValue).isEqualTo("aa,bb"); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void cFStringOption() throws InterruptedException, RocksDBException { - RocksDB db = null; - DBOptions opt = null; - List columnFamilyHandleList = - new ArrayList<>(); - try { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - opt = new DBOptions(); - opt.setCreateIfMissing(true); - opt.setCreateMissingColumnFamilies(true); - List cfDescriptors = - new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions().setMergeOperatorName( - "stringappend"))); - cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions().setMergeOperatorName( - "stringappend"))); - db = RocksDB.open(opt, db_path_string, - cfDescriptors, columnFamilyHandleList); - - // writing aa under key - db.put(columnFamilyHandleList.get(1), - "cfkey".getBytes(), 
"aa".getBytes()); - // merge bb under key - db.merge(columnFamilyHandleList.get(1), - "cfkey".getBytes(), "bb".getBytes()); - - byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); - String strValue = new String(value); - assertThat(strValue).isEqualTo("aa,bb"); - } finally { - for (ColumnFamilyHandle handle : columnFamilyHandleList) { - handle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + try (final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions() + .setMergeOperatorName("stringappend"); + final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions() + .setMergeOperatorName("stringappend") + ) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt2) + ); + + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + try { + // writing aa under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); + + byte[] value = db.get(columnFamilyHandleList.get(1), + "cfkey".getBytes()); + String strValue = new String(value); + assertThat(strValue).isEqualTo("aa,bb"); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandleList) { + handle.close(); + } + } } } } @@ -106,99 +89,85 @@ public void cFStringOption() @Test public void operatorOption() throws InterruptedException, RocksDBException { - RocksDB db = null; - Options opt = null; - try { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - opt = new Options(); - opt.setCreateIfMissing(true); - - StringAppendOperator 
stringAppendOperator = new StringAppendOperator(); - opt.setMergeOperator(stringAppendOperator); - - db = RocksDB.open(opt, db_path_string); + final StringAppendOperator stringAppendOperator = + new StringAppendOperator(); + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { // Writing aa under key db.put("key".getBytes(), "aa".getBytes()); // Writing bb under key db.merge("key".getBytes(), "bb".getBytes()); - byte[] value = db.get("key".getBytes()); - String strValue = new String(value); + final byte[] value = db.get("key".getBytes()); + final String strValue = new String(value); assertThat(strValue).isEqualTo("aa,bb"); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void cFOperatorOption() throws InterruptedException, RocksDBException { - RocksDB db = null; - DBOptions opt = null; - ColumnFamilyHandle cfHandle = null; - List cfDescriptors = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - try { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - opt = new DBOptions(); - opt.setCreateIfMissing(true); - opt.setCreateMissingColumnFamilies(true); - StringAppendOperator stringAppendOperator = new StringAppendOperator(); - - cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions().setMergeOperator( - stringAppendOperator))); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(), - new ColumnFamilyOptions().setMergeOperator( - stringAppendOperator))); - db = RocksDB.open(opt, db_path_string, - cfDescriptors, columnFamilyHandleList); - - // writing aa under key - db.put(columnFamilyHandleList.get(1), - "cfkey".getBytes(), "aa".getBytes()); - // merge bb under key - db.merge(columnFamilyHandleList.get(1), - "cfkey".getBytes(), "bb".getBytes()); - byte[] value = 
db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); - String strValue = new String(value); - - // Test also with createColumnFamily - cfHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2".getBytes(), - new ColumnFamilyOptions().setMergeOperator(stringAppendOperator))); - // writing xx under cfkey2 - db.put(cfHandle, "cfkey2".getBytes(), "xx".getBytes()); - // merge yy under cfkey2 - db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(), "yy".getBytes()); - value = db.get(cfHandle, "cfkey2".getBytes()); - String strValueTmpCf = new String(value); - - assertThat(strValue).isEqualTo("aa,bb"); - assertThat(strValueTmpCf).isEqualTo("xx,yy"); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (cfHandle != null) { - cfHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + final StringAppendOperator stringAppendOperator = + new StringAppendOperator(); + try (final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions() + .setMergeOperator(stringAppendOperator); + final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions() + .setMergeOperator(stringAppendOperator) + ) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), + new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpt2) + ); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList) + ) { + try { + // writing aa under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); + byte[] value = db.get(columnFamilyHandleList.get(1), + 
"cfkey".getBytes()); + String strValue = new String(value); + + // Test also with createColumnFamily + try (final ColumnFamilyOptions cfHandleOpts = + new ColumnFamilyOptions() + .setMergeOperator(stringAppendOperator); + final ColumnFamilyHandle cfHandle = + db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), + cfHandleOpts)) + ) { + // writing xx under cfkey2 + db.put(cfHandle, "cfkey2".getBytes(), "xx".getBytes()); + // merge yy under cfkey2 + db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(), + "yy".getBytes()); + value = db.get(cfHandle, "cfkey2".getBytes()); + String strValueTmpCf = new String(value); + + assertThat(strValue).isEqualTo("aa,bb"); + assertThat(strValueTmpCf).isEqualTo("xx,yy"); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } @@ -206,97 +175,67 @@ public void cFOperatorOption() @Test public void operatorGcBehaviour() throws RocksDBException { - Options opt = null; - RocksDB db = null; - try { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - opt = new Options(); - opt.setCreateIfMissing(true); - StringAppendOperator stringAppendOperator = new StringAppendOperator(); - opt.setMergeOperator(stringAppendOperator); - db = RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); - System.gc(); - System.runFinalization(); - // test reuse - opt = new Options(); - opt.setMergeOperator(stringAppendOperator); - db = RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); - System.gc(); - System.runFinalization(); - // test param init - opt = new Options(); - opt.setMergeOperator(new StringAppendOperator()); - db = RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); - System.gc(); - System.runFinalization(); - // test replace one with another merge operator instance - opt = new Options(); - opt.setMergeOperator(stringAppendOperator); - StringAppendOperator newStringAppendOperator = 
new StringAppendOperator(); + final StringAppendOperator stringAppendOperator + = new StringAppendOperator(); + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test reuse + try (final Options opt = new Options() + .setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test param init + try (final Options opt = new Options() + .setMergeOperator(new StringAppendOperator()); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test replace one with another merge operator instance + try (final Options opt = new Options() + .setMergeOperator(stringAppendOperator)) { + final StringAppendOperator newStringAppendOperator + = new StringAppendOperator(); opt.setMergeOperator(newStringAppendOperator); - db = RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + try (final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op } } } @Test public void emptyStringInSetMergeOperatorByName() { - Options opt = null; - ColumnFamilyOptions cOpt = null; - try { - opt = new Options(); - cOpt = new ColumnFamilyOptions(); - opt.setMergeOperatorName(""); - cOpt.setMergeOperatorName(""); - } finally { - if (opt != null) { - opt.dispose(); - } - if (cOpt != null) { - cOpt.dispose(); - } + try (final Options opt = new Options() + .setMergeOperatorName(""); + final ColumnFamilyOptions cOpt = new ColumnFamilyOptions() + .setMergeOperatorName("")) { + //no-op } } @Test(expected = IllegalArgumentException.class) public void nullStringInSetMergeOperatorByNameOptions() { - Options opt = null; - try { - opt = new Options(); + try (final Options opt = new Options()) { 
opt.setMergeOperatorName(null); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test(expected = IllegalArgumentException.class) public void - nullStringInSetMergeOperatorByNameColumnFamilyOptions() { - ColumnFamilyOptions opt = null; - try { - opt = new ColumnFamilyOptions(); + nullStringInSetMergeOperatorByNameColumnFamilyOptions() { + try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) { opt.setMergeOperatorName(null); - } finally { - if (opt != null) { - opt.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java index f095e99d8d..bbe2957197 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,38 +19,37 @@ public class MixedOptionsTest { @Test public void mixedOptionsTest(){ // Set a table factory and check the names - ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(); - cfOptions.setTableFormatConfig(new BlockBasedTableConfig(). 
- setFilter(new BloomFilter())); - assertThat(cfOptions.tableFactoryName()).isEqualTo( - "BlockBasedTable"); - cfOptions.setTableFormatConfig(new PlainTableConfig()); - assertThat(cfOptions.tableFactoryName()).isEqualTo("PlainTable"); - // Initialize a dbOptions object from cf options and - // db options - DBOptions dbOptions = new DBOptions(); - Options options = new Options(dbOptions, cfOptions); - assertThat(options.tableFactoryName()).isEqualTo("PlainTable"); - // Free instances - options.dispose(); - options = null; - cfOptions.dispose(); - cfOptions = null; - dbOptions.dispose(); - dbOptions = null; - System.gc(); - System.runFinalization(); + try(final Filter bloomFilter = new BloomFilter(); + final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions() + .setTableFormatConfig( + new BlockBasedTableConfig().setFilter(bloomFilter)) + ) { + assertThat(cfOptions.tableFactoryName()).isEqualTo( + "BlockBasedTable"); + cfOptions.setTableFormatConfig(new PlainTableConfig()); + assertThat(cfOptions.tableFactoryName()).isEqualTo("PlainTable"); + // Initialize a dbOptions object from cf options and + // db options + try (final DBOptions dbOptions = new DBOptions(); + final Options options = new Options(dbOptions, cfOptions)) { + assertThat(options.tableFactoryName()).isEqualTo("PlainTable"); + // Free instances + } + } + // Test Optimize for statements - cfOptions = new ColumnFamilyOptions(); + try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { cfOptions.optimizeUniversalStyleCompaction(); cfOptions.optimizeLevelStyleCompaction(); cfOptions.optimizeForPointLookup(1024); - options = new Options(); - options.optimizeLevelStyleCompaction(); - options.optimizeLevelStyleCompaction(400); - options.optimizeUniversalStyleCompaction(); - options.optimizeUniversalStyleCompaction(400); - options.optimizeForPointLookup(1024); - options.prepareForBulkLoad(); + try(final Options options = new Options()) { + options.optimizeLevelStyleCompaction(); + 
options.optimizeLevelStyleCompaction(400); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(400); + options.optimizeForPointLookup(1024); + options.prepareForBulkLoad(); + } + } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java index 7d9322a534..186108ffb6 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -9,6 +9,7 @@ import org.junit.rules.TemporaryFolder; import org.rocksdb.util.Environment; +import java.io.File; import java.io.IOException; import java.nio.file.*; @@ -21,11 +22,20 @@ public class NativeLibraryLoaderTest { @Test public void tempFolder() throws IOException { - NativeLibraryLoader.getInstance().loadLibraryFromJar( + NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( temporaryFolder.getRoot().getAbsolutePath()); - Path path = Paths.get(temporaryFolder.getRoot().getAbsolutePath(), + final Path path = Paths.get(temporaryFolder.getRoot().getAbsolutePath(), Environment.getJniLibraryFileName("rocksdb")); assertThat(Files.exists(path)).isTrue(); assertThat(Files.isReadable(path)).isTrue(); } + + @Test + public void overridesExistingLibrary() throws IOException { + File first = NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( + temporaryFolder.getRoot().getAbsolutePath()); + NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( + temporaryFolder.getRoot().getAbsolutePath()); + 
assertThat(first.exists()).isTrue(); + } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java index 1c1dfc63a3..87e2040b95 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,6 +8,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; + import org.junit.ClassRule; import org.junit.Test; @@ -25,985 +26,581 @@ public class OptionsTest { @Test public void setIncreaseParallelism() { - Options opt = null; - try { - opt = new Options(); + try (final Options opt = new Options()) { final int threads = Runtime.getRuntime().availableProcessors() * 2; opt.setIncreaseParallelism(threads); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void writeBufferSize() throws RocksDBException { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setWriteBufferSize(longValue); assertThat(opt.writeBufferSize()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxWriteBufferNumber() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMaxWriteBufferNumber(intValue); assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } 
@Test public void minWriteBufferNumberToMerge() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMinWriteBufferNumberToMerge(intValue); assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void numLevels() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setNumLevels(intValue); assertThat(opt.numLevels()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelZeroFileNumCompactionTrigger() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setLevelZeroFileNumCompactionTrigger(intValue); assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelZeroSlowdownWritesTrigger() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setLevelZeroSlowdownWritesTrigger(intValue); assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelZeroStopWritesTrigger() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setLevelZeroStopWritesTrigger(intValue); assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void targetFileSizeBase() { - Options opt = null; - try 
{ - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setTargetFileSizeBase(longValue); assertThat(opt.targetFileSizeBase()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void targetFileSizeMultiplier() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setTargetFileSizeMultiplier(intValue); assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxBytesForLevelBase() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setMaxBytesForLevelBase(longValue); assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void levelCompactionDynamicLevelBytes() { - Options opt = null; - try { - opt = new Options(); + try (final Options opt = new Options()) { final boolean boolValue = rand.nextBoolean(); opt.setLevelCompactionDynamicLevelBytes(boolValue); assertThat(opt.levelCompactionDynamicLevelBytes()) .isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxBytesForLevelMultiplier() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMaxBytesForLevelMultiplier(intValue); assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void expandedCompactionFactor() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = 
new Options()) { + final int intValue = rand.nextInt(); opt.setExpandedCompactionFactor(intValue); assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void sourceCompactionFactor() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setSourceCompactionFactor(intValue); assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxGrandparentOverlapFactor() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMaxGrandparentOverlapFactor(intValue); assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void softRateLimit() { - Options opt = null; - try { - opt = new Options(); - double doubleValue = rand.nextDouble(); + try (final Options opt = new Options()) { + final double doubleValue = rand.nextDouble(); opt.setSoftRateLimit(doubleValue); assertThat(opt.softRateLimit()).isEqualTo(doubleValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void hardRateLimit() { - Options opt = null; - try { - opt = new Options(); - double doubleValue = rand.nextDouble(); + try (final Options opt = new Options()) { + final double doubleValue = rand.nextDouble(); opt.setHardRateLimit(doubleValue); assertThat(opt.hardRateLimit()).isEqualTo(doubleValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void rateLimitDelayMaxMilliseconds() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); 
opt.setRateLimitDelayMaxMilliseconds(intValue); assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void arenaBlockSize() throws RocksDBException { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setArenaBlockSize(longValue); assertThat(opt.arenaBlockSize()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void disableAutoCompactions() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setDisableAutoCompactions(boolValue); assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void purgeRedundantKvsWhileFlush() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setPurgeRedundantKvsWhileFlush(boolValue); assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void verifyChecksumsInCompaction() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setVerifyChecksumsInCompaction(boolValue); assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void filterDeletes() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); - opt.setFilterDeletes(boolValue); - 
assertThat(opt.filterDeletes()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxSequentialSkipInIterations() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setMaxSequentialSkipInIterations(longValue); assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void inplaceUpdateSupport() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setInplaceUpdateSupport(boolValue); assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void inplaceUpdateNumLocks() throws RocksDBException { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setInplaceUpdateNumLocks(longValue); assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } - } - } - - @Test - public void memtablePrefixBloomBits() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomBits(intValue); - assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test - public void memtablePrefixBloomProbes() { - Options opt = null; - try { - int intValue = rand.nextInt(); - opt = new Options(); - opt.setMemtablePrefixBloomProbes(intValue); - assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } + public void memtablePrefixBloomSizeRatio() { + try (final 
Options opt = new Options()) { + final double doubleValue = rand.nextDouble(); + opt.setMemtablePrefixBloomSizeRatio(doubleValue); + assertThat(opt.memtablePrefixBloomSizeRatio()).isEqualTo(doubleValue); } } @Test public void bloomLocality() { - Options opt = null; - try { - int intValue = rand.nextInt(); - opt = new Options(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setBloomLocality(intValue); assertThat(opt.bloomLocality()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxSuccessiveMerges() throws RocksDBException { - Options opt = null; - try { - long longValue = rand.nextLong(); - opt = new Options(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setMaxSuccessiveMerges(longValue); assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void minPartialMergeOperands() { - Options opt = null; - try { - int intValue = rand.nextInt(); - opt = new Options(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMinPartialMergeOperands(intValue); assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void optimizeFiltersForHits() { - Options opt = null; - try { - boolean aBoolean = rand.nextBoolean(); - opt = new Options(); + try (final Options opt = new Options()) { + final boolean aBoolean = rand.nextBoolean(); opt.setOptimizeFiltersForHits(aBoolean); assertThat(opt.optimizeFiltersForHits()).isEqualTo(aBoolean); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void createIfMissing() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setCreateIfMissing(boolValue); 
assertThat(opt.createIfMissing()). isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void createMissingColumnFamilies() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setCreateMissingColumnFamilies(boolValue); assertThat(opt.createMissingColumnFamilies()). isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void errorIfExists() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setErrorIfExists(boolValue); assertThat(opt.errorIfExists()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void paranoidChecks() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setParanoidChecks(boolValue); assertThat(opt.paranoidChecks()). isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxTotalWalSize() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setMaxTotalWalSize(longValue); assertThat(opt.maxTotalWalSize()). 
isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxOpenFiles() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMaxOpenFiles(intValue); assertThat(opt.maxOpenFiles()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void disableDataSync() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setDisableDataSync(boolValue); assertThat(opt.disableDataSync()). isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void useFsync() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setUseFsync(boolValue); assertThat(opt.useFsync()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void dbLogDir() { - Options opt = null; - try { - opt = new Options(); - String str = "path/to/DbLogDir"; + try (final Options opt = new Options()) { + final String str = "path/to/DbLogDir"; opt.setDbLogDir(str); assertThat(opt.dbLogDir()).isEqualTo(str); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void walDir() { - Options opt = null; - try { - opt = new Options(); - String str = "path/to/WalDir"; + try (final Options opt = new Options()) { + final String str = "path/to/WalDir"; opt.setWalDir(str); assertThat(opt.walDir()).isEqualTo(str); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void deleteObsoleteFilesPeriodMicros() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + 
final long longValue = rand.nextLong(); opt.setDeleteObsoleteFilesPeriodMicros(longValue); assertThat(opt.deleteObsoleteFilesPeriodMicros()). isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxBackgroundCompactions() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMaxBackgroundCompactions(intValue); assertThat(opt.maxBackgroundCompactions()). isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxBackgroundFlushes() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setMaxBackgroundFlushes(intValue); assertThat(opt.maxBackgroundFlushes()). isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxLogFileSize() throws RocksDBException { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setMaxLogFileSize(longValue); assertThat(opt.maxLogFileSize()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void logFileTimeToRoll() throws RocksDBException { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setLogFileTimeToRoll(longValue); assertThat(opt.logFileTimeToRoll()). 
isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void keepLogFileNum() throws RocksDBException { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setKeepLogFileNum(longValue); assertThat(opt.keepLogFileNum()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void maxManifestFileSize() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setMaxManifestFileSize(longValue); assertThat(opt.maxManifestFileSize()). isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void tableCacheNumshardbits() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setTableCacheNumshardbits(intValue); assertThat(opt.tableCacheNumshardbits()). 
isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void walSizeLimitMB() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setWalSizeLimitMB(longValue); assertThat(opt.walSizeLimitMB()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void walTtlSeconds() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setWalTtlSeconds(longValue); assertThat(opt.walTtlSeconds()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void manifestPreallocationSize() throws RocksDBException { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setManifestPreallocationSize(longValue); assertThat(opt.manifestPreallocationSize()). 
isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void allowOsBuffer() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setAllowOsBuffer(boolValue); assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void allowMmapReads() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setAllowMmapReads(boolValue); assertThat(opt.allowMmapReads()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void allowMmapWrites() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setAllowMmapWrites(boolValue); assertThat(opt.allowMmapWrites()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void isFdCloseOnExec() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setIsFdCloseOnExec(boolValue); assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void statsDumpPeriodSec() { - Options opt = null; - try { - opt = new Options(); - int intValue = rand.nextInt(); + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); opt.setStatsDumpPeriodSec(intValue); assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void adviseRandomOnOpen() { - Options opt = 
null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setAdviseRandomOnOpen(boolValue); assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void useAdaptiveMutex() { - Options opt = null; - try { - opt = new Options(); - boolean boolValue = rand.nextBoolean(); + try (final Options opt = new Options()) { + final boolean boolValue = rand.nextBoolean(); opt.setUseAdaptiveMutex(boolValue); assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void bytesPerSync() { - Options opt = null; - try { - opt = new Options(); - long longValue = rand.nextLong(); + try (final Options opt = new Options()) { + final long longValue = rand.nextLong(); opt.setBytesPerSync(longValue); assertThat(opt.bytesPerSync()).isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void env() { - Options options = null; - try { - options = new Options(); - Env env = Env.getDefault(); + try (final Options options = new Options(); + final Env env = Env.getDefault()) { options.setEnv(env); assertThat(options.getEnv()).isSameAs(env); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void linkageOfPrepMethods() { - Options options = null; - try { - options = new Options(); + try (final Options options = new Options()) { options.optimizeUniversalStyleCompaction(); options.optimizeUniversalStyleCompaction(4000); options.optimizeLevelStyleCompaction(); options.optimizeLevelStyleCompaction(3000); options.optimizeForPointLookup(10); options.prepareForBulkLoad(); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void compressionTypes() { - Options options = null; - try { - options = new Options(); - for (CompressionType compressionType : 
+ try (final Options options = new Options()) { + for (final CompressionType compressionType : CompressionType.values()) { options.setCompressionType(compressionType); assertThat(options.compressionType()). @@ -1011,22 +608,17 @@ public void compressionTypes() { assertThat(CompressionType.valueOf("NO_COMPRESSION")). isEqualTo(CompressionType.NO_COMPRESSION); } - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void compressionPerLevel() { - ColumnFamilyOptions columnFamilyOptions = null; - try { - columnFamilyOptions = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions columnFamilyOptions = + new ColumnFamilyOptions()) { assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty(); List compressionTypeList = new ArrayList<>(); - for (int i=0; i < columnFamilyOptions.numLevels(); i++) { + for (int i = 0; i < columnFamilyOptions.numLevels(); i++) { compressionTypeList.add(CompressionType.NO_COMPRESSION); } columnFamilyOptions.setCompressionPerLevel(compressionTypeList); @@ -1035,18 +627,13 @@ public void compressionPerLevel() { assertThat(compressionType).isEqualTo( CompressionType.NO_COMPRESSION); } - } finally { - if (columnFamilyOptions != null) { - columnFamilyOptions.dispose(); - } } } @Test public void differentCompressionsPerLevel() { - ColumnFamilyOptions columnFamilyOptions = null; - try { - columnFamilyOptions = new ColumnFamilyOptions(); + try (final ColumnFamilyOptions columnFamilyOptions = + new ColumnFamilyOptions()) { columnFamilyOptions.setNumLevels(3); assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty(); @@ -1066,19 +653,13 @@ public void differentCompressionsPerLevel() { CompressionType.SNAPPY_COMPRESSION, CompressionType.LZ4_COMPRESSION); - } finally { - if (columnFamilyOptions != null) { - columnFamilyOptions.dispose(); - } } } @Test public void compactionStyles() { - Options options = null; - try { - options = new Options(); - for (CompactionStyle compactionStyle : + try (final Options options 
= new Options()) { + for (final CompactionStyle compactionStyle : CompactionStyle.values()) { options.setCompactionStyle(compactionStyle); assertThat(options.compactionStyle()). @@ -1086,18 +667,12 @@ public void compactionStyles() { assertThat(CompactionStyle.valueOf("FIFO")). isEqualTo(CompactionStyle.FIFO); } - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void maxTableFilesSizeFIFO() { - Options opt = null; - try { - opt = new Options(); + try (final Options opt = new Options()) { long longValue = rand.nextLong(); // Size has to be positive longValue = (longValue < 0) ? -longValue : longValue; @@ -1105,61 +680,36 @@ public void maxTableFilesSizeFIFO() { opt.setMaxTableFilesSizeFIFO(longValue); assertThat(opt.maxTableFilesSizeFIFO()). isEqualTo(longValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test public void rateLimiterConfig() { - Options options = null; - Options anotherOptions = null; - RateLimiterConfig rateLimiterConfig; - try { - options = new Options(); - rateLimiterConfig = new GenericRateLimiterConfig(1000, 100 * 1000, 1); + try (final Options options = new Options(); + final Options anotherOptions = new Options()) { + final RateLimiterConfig rateLimiterConfig = + new GenericRateLimiterConfig(1000, 100 * 1000, 1); options.setRateLimiterConfig(rateLimiterConfig); // Test with parameter initialization - anotherOptions = new Options(); + anotherOptions.setRateLimiterConfig( new GenericRateLimiterConfig(1000)); - } finally { - if (options != null) { - options.dispose(); - } - if (anotherOptions != null) { - anotherOptions.dispose(); - } } } @Test public void shouldSetTestPrefixExtractor() { - Options options = null; - try { - options = new Options(); + try (final Options options = new Options()) { options.useFixedLengthPrefixExtractor(100); options.useFixedLengthPrefixExtractor(10); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void 
shouldSetTestCappedPrefixExtractor() { - Options options = null; - try { - options = new Options(); + try (final Options options = new Options()) { options.useCappedPrefixExtractor(100); options.useCappedPrefixExtractor(10); - } finally { - if (options != null) { - options.dispose(); - } } } @@ -1167,9 +717,7 @@ public void shouldSetTestCappedPrefixExtractor() { @Test public void shouldTestMemTableFactoryName() throws RocksDBException { - Options options = null; - try { - options = new Options(); + try (final Options options = new Options()) { options.setMemTableConfig(new VectorMemTableConfig()); assertThat(options.memTableFactoryName()). isEqualTo("VectorRepFactory"); @@ -1177,31 +725,18 @@ public void shouldTestMemTableFactoryName() new HashLinkedListMemTableConfig()); assertThat(options.memTableFactoryName()). isEqualTo("HashLinkedListRepFactory"); - } finally { - if (options != null) { - options.dispose(); - } } } @Test public void statistics() { - Options options = null; - Options anotherOptions = null; - try { - options = new Options(); + try (final Options options = new Options()) { Statistics statistics = options.createStatistics(). 
statisticsPtr(); assertThat(statistics).isNotNull(); - anotherOptions = new Options(); - statistics = anotherOptions.statisticsPtr(); - assertThat(statistics).isNotNull(); - } finally { - if (options != null) { - options.dispose(); - } - if (anotherOptions != null) { - anotherOptions.dispose(); + try (final Options anotherOptions = new Options()) { + statistics = anotherOptions.statisticsPtr(); + assertThat(statistics).isNotNull(); } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java index 850b050a0a..05bd13863d 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -80,16 +80,10 @@ public void storeIndexInFile() { @Test public void plainTableConfig() { - Options opt = null; - try { - opt = new Options(); - PlainTableConfig plainTableConfig = new PlainTableConfig(); + try(final Options opt = new Options()) { + final PlainTableConfig plainTableConfig = new PlainTableConfig(); opt.setTableFormatConfig(plainTableConfig); assertThat(opt.tableFactoryName()).isEqualTo("PlainTable"); - } finally { - if (opt != null) { - opt.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java b/external/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java index 0155ce2639..b437e7f97b 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,7 +18,7 @@ public class PlatformRandomHelper { * @return boolean value indicating if operating system is 64 Bit. */ public static boolean isOs64Bit(){ - boolean is64Bit; + final boolean is64Bit; if (System.getProperty("os.name").contains("Windows")) { is64Bit = (System.getenv("ProgramFiles(x86)") != null); } else { diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java index 70ea75d156..d993c91484 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,6 +10,7 @@ import org.junit.rules.TemporaryFolder; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; @@ -25,340 +26,279 @@ public class ReadOnlyTest { @Test public void readOnlyOpen() throws RocksDBException { - RocksDB db = null; - RocksDB db2 = null; - RocksDB db3 = null; - Options options = null; - List columnFamilyHandleList = - new ArrayList<>(); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - List readOnlyColumnFamilyHandleList2 = - new ArrayList<>(); - try { - options = new Options(); - options.setCreateIfMissing(true); - - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); - db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath()); - assertThat("value"). 
- isEqualTo(new String(db2.get("key".getBytes()))); - db.close(); - db2.close(); - - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - - db = RocksDB.open( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions()))); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions()))); - db.put(columnFamilyHandleList.get(2), "key2".getBytes(), - "value2".getBytes()); - - db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - assertThat(db2.get("key2".getBytes())).isNull(); - assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), "key2".getBytes())). - isNull(); - cfDescriptors.clear(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - cfDescriptors.add( - new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions())); - db3 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList2); - assertThat(new String(db3.get(readOnlyColumnFamilyHandleList2.get(1), - "key2".getBytes()))).isEqualTo("value2"); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.dispose(); + try (final RocksDB db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath())) { + assertThat("value"). 
+ isEqualTo(new String(db2.get("key".getBytes()))); } - if (db2 != null) { - db2.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList2) { - columnFamilyHandle.dispose(); - } - if (db3 != null) { - db3.close(); - } - if (options != null) { - options.dispose(); + } + + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + + final List columnFamilyHandleList = new ArrayList<>(); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try (final ColumnFamilyOptions newCfOpts = new ColumnFamilyOptions(); + final ColumnFamilyOptions newCf2Opts = new ColumnFamilyOptions() + ) { + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf".getBytes(), newCfOpts))); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), newCf2Opts))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), + "value2".getBytes()); + + final List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + try (final RocksDB db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList)) { + try (final ColumnFamilyOptions newCfOpts2 = + new ColumnFamilyOptions(); + final ColumnFamilyOptions newCf2Opts2 = + new ColumnFamilyOptions() + ) { + assertThat(db2.get("key2".getBytes())).isNull(); + assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), + "key2".getBytes())). 
+ isNull(); + cfDescriptors.clear(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + newCfOpts2)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), + newCf2Opts2)); + + final List readOnlyColumnFamilyHandleList2 + = new ArrayList<>(); + try (final RocksDB db3 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList2)) { + try { + assertThat(new String(db3.get( + readOnlyColumnFamilyHandleList2.get(1), + "key2".getBytes()))).isEqualTo("value2"); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList2) { + columnFamilyHandle.close(); + } + } + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } @Test(expected = RocksDBException.class) public void failToWriteInReadOnly() throws RocksDBException { - RocksDB db = null; - RocksDB rDb = null; - Options options = null; - List cfDescriptors = new ArrayList<>(); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try { + try (final Options options = new Options() + .setCreateIfMissing(true)) { - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + } - options = new Options(); - options.setCreateIfMissing(true); + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) + ); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - rDb = RocksDB.openReadOnly( + final List readOnlyColumnFamilyHandleList = + new 
ArrayList<>(); + try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - // test that put fails in readonly mode - rDb.put("key".getBytes(), "value".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (rDb != null) { - rDb.close(); - } - if (options != null) { - options.dispose(); + readOnlyColumnFamilyHandleList)) { + try { + // test that put fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } @Test(expected = RocksDBException.class) public void failToCFWriteInReadOnly() throws RocksDBException { - RocksDB db = null; - RocksDB rDb = null; - Options options = null; - List cfDescriptors = new ArrayList<>(); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try { - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - - options = new Options(); - options.setCreateIfMissing(true); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - rDb = RocksDB.openReadOnly( + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) + ); + final List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - rDb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), 
"value".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (rDb != null) { - rDb.close(); - } - if (options != null) { - options.dispose(); + readOnlyColumnFamilyHandleList)) { + try { + rDb.put(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes(), "value".getBytes()); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } @Test(expected = RocksDBException.class) public void failToRemoveInReadOnly() throws RocksDBException { - RocksDB db = null; - RocksDB rDb = null; - Options options = null; - List cfDescriptors = new ArrayList<>(); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try { - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } - options = new Options(); - options.setCreateIfMissing(true); + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) + ); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); + final List readOnlyColumnFamilyHandleList = + new ArrayList<>(); - rDb.remove("key".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (rDb != null) { - rDb.close(); - } - if (options != null) { - options.dispose(); + try (final RocksDB rDb = RocksDB.openReadOnly( + 
dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList)) { + try { + rDb.remove("key".getBytes()); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } @Test(expected = RocksDBException.class) public void failToCFRemoveInReadOnly() throws RocksDBException { - RocksDB db = null; - RocksDB rDb = null; - Options options = null; - List cfDescriptors = new ArrayList<>(); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try { - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - - options = new Options(); - options.setCreateIfMissing(true); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) + ); - rDb = RocksDB.openReadOnly( + final List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - rDb.remove(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (rDb != null) { - rDb.close(); - } - if (options != null) { - options.dispose(); + readOnlyColumnFamilyHandleList)) { + try { + rDb.remove(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes()); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList) { + columnFamilyHandle.close(); + 
} + } } } } @Test(expected = RocksDBException.class) public void failToWriteBatchReadOnly() throws RocksDBException { - RocksDB db = null; - RocksDB rDb = null; - Options options = null; - List cfDescriptors = new ArrayList<>(); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try { - - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - - options = new Options(); - options.setCreateIfMissing(true); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) + ); - rDb = RocksDB.openReadOnly( + final List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList); - - WriteBatch wb = new WriteBatch(); - wb.put("key".getBytes(), "value".getBytes()); - rDb.write(new WriteOptions(), wb); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (rDb != null) { - rDb.close(); - } - if (options != null) { - options.dispose(); + final WriteBatch wb = new WriteBatch(); + final WriteOptions wOpts = new WriteOptions()) { + try { + wb.put("key".getBytes(), "value".getBytes()); + rDb.write(wOpts, wb); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } @Test(expected = RocksDBException.class) public void failToCFWriteBatchReadOnly() throws RocksDBException { - RocksDB db = null; - RocksDB rDb = 
null; - Options options = null; - WriteBatch wb = null; - List cfDescriptors = new ArrayList<>(); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try { - - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - - - options = new Options(); - options.setCreateIfMissing(true); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) + ); - rDb = RocksDB.openReadOnly( + final List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList); - - wb = new WriteBatch(); - wb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), "value".getBytes()); - rDb.write(new WriteOptions(), wb); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (rDb != null) { - rDb.close(); - } - if (options != null) { - options.dispose(); - } - if (wb != null) { - wb.dispose(); + final WriteBatch wb = new WriteBatch(); + final WriteOptions wOpts = new WriteOptions()) { + try { + wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), + "value".getBytes()); + rDb.write(wOpts, wb); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + readOnlyColumnFamilyHandleList) { + columnFamilyHandle.close(); + } + } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java index 
af88ce3519..58ed2ecc6f 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -24,127 +24,151 @@ public class ReadOptionsTest { public ExpectedException exception = ExpectedException.none(); @Test - public void verifyChecksum(){ - ReadOptions opt = null; - try { - opt = new ReadOptions(); - Random rand = new Random(); - boolean boolValue = rand.nextBoolean(); + public void verifyChecksum() { + try (final ReadOptions opt = new ReadOptions()) { + final Random rand = new Random(); + final boolean boolValue = rand.nextBoolean(); opt.setVerifyChecksums(boolValue); assertThat(opt.verifyChecksums()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test - public void fillCache(){ - ReadOptions opt = null; - try { - opt = new ReadOptions(); - Random rand = new Random(); - boolean boolValue = rand.nextBoolean(); + public void fillCache() { + try (final ReadOptions opt = new ReadOptions()) { + final Random rand = new Random(); + final boolean boolValue = rand.nextBoolean(); opt.setFillCache(boolValue); assertThat(opt.fillCache()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test - public void tailing(){ - ReadOptions opt = null; - try { - opt = new ReadOptions(); - Random rand = new Random(); - boolean boolValue = rand.nextBoolean(); + public void tailing() { + try (final ReadOptions opt = new ReadOptions()) { + final Random rand = new Random(); + final boolean boolValue = rand.nextBoolean(); opt.setTailing(boolValue); 
assertThat(opt.tailing()).isEqualTo(boolValue); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test - public void snapshot(){ - ReadOptions opt = null; - try { - opt = new ReadOptions(); + public void snapshot() { + try (final ReadOptions opt = new ReadOptions()) { opt.setSnapshot(null); assertThat(opt.snapshot()).isNull(); - } finally { - if (opt != null) { - opt.dispose(); - } } } @Test - public void failSetVerifyChecksumUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.setVerifyChecksums(true); + public void readTier() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setReadTier(ReadTier.BLOCK_CACHE_TIER); + assertThat(opt.readTier()).isEqualTo(ReadTier.BLOCK_CACHE_TIER); + } + } + + @Test + public void managed() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setManaged(true); + assertThat(opt.managed()).isTrue(); + } + } + + @Test + public void totalOrderSeek() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setTotalOrderSeek(true); + assertThat(opt.totalOrderSeek()).isTrue(); + } + } + + @Test + public void prefixSameAsStart() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setPrefixSameAsStart(true); + assertThat(opt.prefixSameAsStart()).isTrue(); + } + } + + @Test + public void pinData() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setPinData(true); + assertThat(opt.pinData()).isTrue(); + } + } + + @Test + public void failSetVerifyChecksumUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.setVerifyChecksums(true); + } } @Test - public void failVerifyChecksumUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.verifyChecksums(); + public void failVerifyChecksumUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.verifyChecksums(); + } } @Test - public 
void failSetFillCacheUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.setFillCache(true); + public void failSetFillCacheUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.setFillCache(true); + } } @Test - public void failFillCacheUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.fillCache(); + public void failFillCacheUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.fillCache(); + } } @Test - public void failSetTailingUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.setTailing(true); + public void failSetTailingUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.setTailing(true); + } } @Test - public void failTailingUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.tailing(); + public void failTailingUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.tailing(); + } } @Test - public void failSetSnapshotUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.setSnapshot(null); + public void failSetSnapshotUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.setSnapshot(null); + } } @Test - public void failSnapshotUninitialized(){ - ReadOptions readOptions = setupUninitializedReadOptions( - exception); - readOptions.snapshot(); + public void failSnapshotUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.snapshot(); + } } private ReadOptions setupUninitializedReadOptions( ExpectedException exception) { - ReadOptions 
readOptions = new ReadOptions(); - readOptions.dispose(); + final ReadOptions readOptions = new ReadOptions(); + readOptions.close(); exception.expect(AssertionError.class); return readOptions; } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java index 31d2c52384..2a31d826f3 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -9,10 +9,7 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Random; +import java.util.*; import static org.assertj.core.api.Assertions.assertThat; @@ -30,96 +27,65 @@ public class RocksDBTest { @Test public void open() throws RocksDBException { - RocksDB db = null; - Options opt = null; - try { - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); - db.close(); - opt = new Options(); - opt.setCreateIfMissing(true); - db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath()); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } + try (final RocksDB db = + RocksDB.open(dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + } + } + + @Test + public void open_opt() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); } } @Test public void put() throws RocksDBException 
{ - RocksDB db = null; - WriteOptions opt = null; - try { - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final WriteOptions opt = new WriteOptions()) { db.put("key1".getBytes(), "value".getBytes()); - opt = new WriteOptions(); db.put(opt, "key2".getBytes(), "12345678".getBytes()); assertThat(db.get("key1".getBytes())).isEqualTo( "value".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo( "12345678".getBytes()); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void write() throws RocksDBException { - RocksDB db = null; - Options options = null; - WriteBatch wb1 = null; - WriteBatch wb2 = null; - WriteOptions opts = null; - try { - options = new Options(). - setMergeOperator(new StringAppendOperator()). - setCreateIfMissing(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - opts = new WriteOptions(); - wb1 = new WriteBatch(); - wb1.put("key1".getBytes(), "aa".getBytes()); - wb1.merge("key1".getBytes(), "bb".getBytes()); - wb2 = new WriteBatch(); - wb2.put("key2".getBytes(), "xx".getBytes()); - wb2.merge("key2".getBytes(), "yy".getBytes()); - db.write(opts, wb1); - db.write(opts, wb2); + try (final Options options = new Options().setMergeOperator( + new StringAppendOperator()).setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions opts = new WriteOptions()) { + + try (final WriteBatch wb1 = new WriteBatch()) { + wb1.put("key1".getBytes(), "aa".getBytes()); + wb1.merge("key1".getBytes(), "bb".getBytes()); + + try (final WriteBatch wb2 = new WriteBatch()) { + wb2.put("key2".getBytes(), "xx".getBytes()); + wb2.merge("key2".getBytes(), "yy".getBytes()); + db.write(opts, wb1); + db.write(opts, wb2); + } + } + assertThat(db.get("key1".getBytes())).isEqualTo( "aa,bb".getBytes()); 
assertThat(db.get("key2".getBytes())).isEqualTo( "xx,yy".getBytes()); - } finally { - if (db != null) { - db.close(); - } - if (wb1 != null) { - wb1.dispose(); - } - if (wb2 != null) { - wb2.dispose(); - } - if (options != null) { - options.dispose(); - } - if (opts != null) { - opts.dispose(); - } } } @Test public void getWithOutValue() throws RocksDBException { - RocksDB db = null; - try { - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + try (final RocksDB db = + RocksDB.open(dbFolder.getRoot().getAbsolutePath())) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); byte[] outValue = new byte[5]; @@ -134,20 +100,13 @@ public void getWithOutValue() throws RocksDBException { getResult = db.get("key2".getBytes(), outValue); assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); assertThat(outValue).isEqualTo("12345".getBytes()); - } finally { - if (db != null) { - db.close(); - } } } @Test public void getWithOutValueReadOptions() throws RocksDBException { - RocksDB db = null; - ReadOptions rOpt = null; - try { - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); - rOpt = new ReadOptions(); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final ReadOptions rOpt = new ReadOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); byte[] outValue = new byte[5]; @@ -163,29 +122,18 @@ public void getWithOutValueReadOptions() throws RocksDBException { getResult = db.get(rOpt, "key2".getBytes(), outValue); assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); assertThat(outValue).isEqualTo("12345".getBytes()); - } finally { - if (db != null) { - db.close(); - } - if (rOpt != null) { - rOpt.dispose(); - } } } @Test - public void multiGet() throws RocksDBException { - RocksDB db = null; - ReadOptions rOpt = null; - try { - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); - rOpt = new ReadOptions(); + public void multiGet() 
throws RocksDBException, InterruptedException { + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final ReadOptions rOpt = new ReadOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); - List lookupKeys = new ArrayList() {{ - add("key1".getBytes()); - add("key2".getBytes()); - }}; + List lookupKeys = new ArrayList<>(); + lookupKeys.add("key1".getBytes()); + lookupKeys.add("key2".getBytes()); Map results = db.multiGet(lookupKeys); assertThat(results).isNotNull(); assertThat(results.values()).isNotNull(); @@ -213,27 +161,18 @@ public void multiGet() throws RocksDBException { assertThat(results.values()).isNotNull(); assertThat(results.values()). contains("value".getBytes()); - } finally { - if (db != null) { - db.close(); - } - if (rOpt != null) { - rOpt.dispose(); - } } } @Test public void merge() throws RocksDBException { - RocksDB db = null; - Options opt = null; - WriteOptions wOpt; - try { - opt = new Options(). - setCreateIfMissing(true). 
- setMergeOperator(new StringAppendOperator()); - wOpt = new WriteOptions(); - db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath()); + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(new StringAppendOperator()); + final WriteOptions wOpt = new WriteOptions(); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()) + ) { db.put("key1".getBytes(), "value".getBytes()); assertThat(db.get("key1".getBytes())).isEqualTo( "value".getBytes()); @@ -249,23 +188,13 @@ public void merge() throws RocksDBException { db.merge(wOpt, "key2".getBytes(), "xxxx".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo( "xxxx".getBytes()); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void remove() throws RocksDBException { - RocksDB db = null; - WriteOptions wOpt; - try { - wOpt = new WriteOptions(); - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final WriteOptions wOpt = new WriteOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); assertThat(db.get("key1".getBytes())).isEqualTo( @@ -276,66 +205,47 @@ public void remove() throws RocksDBException { db.remove(wOpt, "key2".getBytes()); assertThat(db.get("key1".getBytes())).isNull(); assertThat(db.get("key2".getBytes())).isNull(); - } finally { - if (db != null) { - db.close(); - } } } @Test public void getIntProperty() throws RocksDBException { - RocksDB db = null; - Options options = null; - WriteOptions wOpt = null; - try { - options = new Options(); - wOpt = new WriteOptions(); - // Setup options - options.setCreateIfMissing(true); - options.setMaxWriteBufferNumber(10); - options.setMinWriteBufferNumberToMerge(10); - wOpt.setDisableWAL(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try ( + final Options options = new 
Options() + .setCreateIfMissing(true) + .setMaxWriteBufferNumber(10) + .setMinWriteBufferNumberToMerge(10); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions wOpt = new WriteOptions().setDisableWAL(true) + ) { db.put(wOpt, "key1".getBytes(), "value1".getBytes()); db.put(wOpt, "key2".getBytes(), "value2".getBytes()); db.put(wOpt, "key3".getBytes(), "value3".getBytes()); db.put(wOpt, "key4".getBytes(), "value4".getBytes()); - assertThat(db.getLongProperty("rocksdb.num-entries-active-mem-table")).isGreaterThan(0); - assertThat(db.getLongProperty("rocksdb.cur-size-active-mem-table")).isGreaterThan(0); - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } - if (wOpt != null) { - wOpt.dispose(); - } + assertThat(db.getLongProperty("rocksdb.num-entries-active-mem-table")) + .isGreaterThan(0); + assertThat(db.getLongProperty("rocksdb.cur-size-active-mem-table")) + .isGreaterThan(0); } } @Test public void fullCompactRange() throws RocksDBException { - RocksDB db = null; - Options opt = null; - try { - opt = new Options(). - setCreateIfMissing(true). - setDisableAutoCompactions(true). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(4). - setWriteBufferSize(100<<10). - setLevelZeroFileNumCompactionTrigger(3). - setTargetFileSizeBase(200 << 10). - setTargetFileSizeMultiplier(1). - setMaxBytesForLevelBase(500 << 10). - setMaxBytesForLevelMultiplier(1). - setDisableAutoCompactions(false); - // open database - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + try (final Options opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { // fill database with key/value pairs byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { @@ -343,66 +253,53 @@ public void fullCompactRange() throws RocksDBException { db.put((String.valueOf(i)).getBytes(), b); } db.compactRange(); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void fullCompactRangeColumnFamily() throws RocksDBException { - RocksDB db = null; - DBOptions opt = null; - List columnFamilyHandles = - new ArrayList<>(); - try { - opt = new DBOptions(). - setCreateIfMissing(true). - setCreateMissingColumnFamilies(true); - List columnFamilyDescriptors = - new ArrayList<>(); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY)); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf".getBytes(), - new ColumnFamilyOptions(). - setDisableAutoCompactions(true). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(4). - setWriteBufferSize(100 << 10). - setLevelZeroFileNumCompactionTrigger(3). - setTargetFileSizeBase(200 << 10). - setTargetFileSizeMultiplier(1). - setMaxBytesForLevelBase(500 << 10). - setMaxBytesForLevelMultiplier(1). - setDisableAutoCompactions(false))); + try ( + final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false) + ) { + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)); + // open database - db = RocksDB.open(opt, + final List columnFamilyHandles = new ArrayList<>(); + try (final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, - columnFamilyHandles); - // fill database with key/value pairs - byte[] b = new byte[10000]; - for (int i = 0; i < 200; i++) { - rand.nextBytes(b); - db.put(columnFamilyHandles.get(1), - String.valueOf(i).getBytes(), b); - } - db.compactRange(columnFamilyHandles.get(1)); - } finally { - for (ColumnFamilyHandle handle : columnFamilyHandles) { - handle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + columnFamilyHandles)) { + try { + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1)); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } } } } @@ -410,24 +307,20 @@ public void fullCompactRangeColumnFamily() @Test public void compactRangeWithKeys() throws RocksDBException { - RocksDB db = null; - Options opt = null; - try { - opt = new Options(). - setCreateIfMissing(true). - setDisableAutoCompactions(true). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(4). - setWriteBufferSize(100<<10). - setLevelZeroFileNumCompactionTrigger(3). - setTargetFileSizeBase(200 << 10). - setTargetFileSizeMultiplier(1). - setMaxBytesForLevelBase(500 << 10). - setMaxBytesForLevelMultiplier(1). - setDisableAutoCompactions(false); - // open database - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + try (final Options opt = new Options(). 
+ setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { // fill database with key/value pairs byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { @@ -435,37 +328,27 @@ public void compactRangeWithKeys() db.put((String.valueOf(i)).getBytes(), b); } db.compactRange("0".getBytes(), "201".getBytes()); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void compactRangeWithKeysReduce() throws RocksDBException { - RocksDB db = null; - Options opt = null; - try { - opt = new Options(). - setCreateIfMissing(true). - setDisableAutoCompactions(true). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(4). - setWriteBufferSize(100<<10). - setLevelZeroFileNumCompactionTrigger(3). - setTargetFileSizeBase(200 << 10). - setTargetFileSizeMultiplier(1). - setMaxBytesForLevelBase(500 << 10). - setMaxBytesForLevelMultiplier(1). - setDisableAutoCompactions(false); - // open database - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + try ( + final Options opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { // fill database with key/value pairs byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { @@ -475,67 +358,55 @@ public void compactRangeWithKeysReduce() db.flush(new FlushOptions().setWaitForFlush(true)); db.compactRange("0".getBytes(), "201".getBytes(), true, -1, 0); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void compactRangeWithKeysColumnFamily() throws RocksDBException { - RocksDB db = null; - DBOptions opt = null; - List columnFamilyHandles = - new ArrayList<>(); - try { - opt = new DBOptions(). - setCreateIfMissing(true). - setCreateMissingColumnFamilies(true); - List columnFamilyDescriptors = - new ArrayList<>(); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY)); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf".getBytes(), - new ColumnFamilyOptions(). - setDisableAutoCompactions(true). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(4). - setWriteBufferSize(100<<10). - setLevelZeroFileNumCompactionTrigger(3). - setTargetFileSizeBase(200 << 10). - setTargetFileSizeMultiplier(1). - setMaxBytesForLevelBase(500 << 10). - setMaxBytesForLevelMultiplier(1). - setDisableAutoCompactions(false))); + try (final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false) + ) { + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts) + ); + // open database - db = RocksDB.open(opt, + final List columnFamilyHandles = + new ArrayList<>(); + try (final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, - columnFamilyHandles); - // fill database with key/value pairs - byte[] b = new byte[10000]; - for (int i = 0; i < 200; i++) { - rand.nextBytes(b); - db.put(columnFamilyHandles.get(1), - String.valueOf(i).getBytes(), b); - } - db.compactRange(columnFamilyHandles.get(1), - "0".getBytes(), "201".getBytes()); - } finally { - for (ColumnFamilyHandle handle : columnFamilyHandles) { - handle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + columnFamilyHandles)) { + try { + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1), + "0".getBytes(), "201".getBytes()); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } } } } @@ -543,54 +414,48 @@ public void compactRangeWithKeysColumnFamily() @Test public void compactRangeWithKeysReduceColumnFamily() throws RocksDBException { - RocksDB db = null; - DBOptions opt = null; - List columnFamilyHandles = - new ArrayList<>(); - try { - opt = new DBOptions(). - setCreateIfMissing(true). - setCreateMissingColumnFamilies(true); - List columnFamilyDescriptors = - new ArrayList<>(); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY)); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf".getBytes(), - new ColumnFamilyOptions(). - setDisableAutoCompactions(true). 
- setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(4). - setWriteBufferSize(100<<10). - setLevelZeroFileNumCompactionTrigger(3). - setTargetFileSizeBase(200 << 10). - setTargetFileSizeMultiplier(1). - setMaxBytesForLevelBase(500 << 10). - setMaxBytesForLevelMultiplier(1). - setDisableAutoCompactions(false))); + try (final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false) + ) { + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts) + ); + + final List columnFamilyHandles = new ArrayList<>(); // open database - db = RocksDB.open(opt, + try (final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, - columnFamilyHandles); - // fill database with key/value pairs - byte[] b = new byte[10000]; - for (int i = 0; i < 200; i++) { - rand.nextBytes(b); - db.put(columnFamilyHandles.get(1), - String.valueOf(i).getBytes(), b); - } - db.compactRange(columnFamilyHandles.get(1), "0".getBytes(), - "201".getBytes(), true, -1, 0); - } finally { - for (ColumnFamilyHandle handle : columnFamilyHandles) { - handle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); + columnFamilyHandles)) { + try { + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + 
db.compactRange(columnFamilyHandles.get(1), "0".getBytes(), + "201".getBytes(), true, -1, 0); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } } } } @@ -598,35 +463,33 @@ public void compactRangeWithKeysReduceColumnFamily() @Test public void compactRangeToLevel() throws RocksDBException, InterruptedException { - RocksDB db = null; - Options opt = null; - try { - final int NUM_KEYS_PER_L0_FILE = 100; - final int KEY_SIZE = 20; - final int VALUE_SIZE = 300; - final int L0_FILE_SIZE = - NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE); - final int NUM_L0_FILES = 10; - final int TEST_SCALE = 5; - final int KEY_INTERVAL = 100; - opt = new Options(). - setCreateIfMissing(true). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(5). - // a slightly bigger write buffer than L0 file - // so that we can ensure manual flush always - // go before background flush happens. - setWriteBufferSize(L0_FILE_SIZE * 2). - // Disable auto L0 -> L1 compaction - setLevelZeroFileNumCompactionTrigger(20). - setTargetFileSizeBase(L0_FILE_SIZE * 100). - setTargetFileSizeMultiplier(1). - // To disable auto compaction - setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100). - setMaxBytesForLevelMultiplier(2). - setDisableAutoCompactions(true); - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); + final int NUM_KEYS_PER_L0_FILE = 100; + final int KEY_SIZE = 20; + final int VALUE_SIZE = 300; + final int L0_FILE_SIZE = + NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE); + final int NUM_L0_FILES = 10; + final int TEST_SCALE = 5; + final int KEY_INTERVAL = 100; + try (final Options opt = new Options(). + setCreateIfMissing(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(5). + // a slightly bigger write buffer than L0 file + // so that we can ensure manual flush always + // go before background flush happens. + setWriteBufferSize(L0_FILE_SIZE * 2). 
+ // Disable auto L0 -> L1 compaction + setLevelZeroFileNumCompactionTrigger(20). + setTargetFileSizeBase(L0_FILE_SIZE * 100). + setTargetFileSizeMultiplier(1). + // To disable auto compaction + setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100). + setMaxBytesForLevelMultiplier(2). + setDisableAutoCompactions(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()) + ) { // fill database with key/value pairs byte[] value = new byte[VALUE_SIZE]; int int_key = 0; @@ -638,7 +501,7 @@ public void compactRangeToLevel() rand.nextBytes(value); db.put(String.format("%020d", int_key).getBytes(), - value); + value); } db.flush(new FlushOptions().setWaitForFlush(true)); // Make sure we do create one more L0 files. @@ -669,141 +532,134 @@ public void compactRangeToLevel() db.getProperty("rocksdb.num-files-at-level2")). isEqualTo("0"); } - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } @Test public void compactRangeToLevelColumnFamily() throws RocksDBException { - RocksDB db = null; - DBOptions opt = null; - List columnFamilyHandles = - new ArrayList<>(); - try { - final int NUM_KEYS_PER_L0_FILE = 100; - final int KEY_SIZE = 20; - final int VALUE_SIZE = 300; - final int L0_FILE_SIZE = - NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE); - final int NUM_L0_FILES = 10; - final int TEST_SCALE = 5; - final int KEY_INTERVAL = 100; - opt = new DBOptions(). - setCreateIfMissing(true). - setCreateMissingColumnFamilies(true); - List columnFamilyDescriptors = - new ArrayList<>(); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY)); - columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf".getBytes(), - new ColumnFamilyOptions(). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(5). - // a slightly bigger write buffer than L0 file - // so that we can ensure manual flush always - // go before background flush happens. 
- setWriteBufferSize(L0_FILE_SIZE * 2). - // Disable auto L0 -> L1 compaction - setLevelZeroFileNumCompactionTrigger(20). - setTargetFileSizeBase(L0_FILE_SIZE * 100). - setTargetFileSizeMultiplier(1). - // To disable auto compaction - setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100). - setMaxBytesForLevelMultiplier(2). - setDisableAutoCompactions(true))); + final int NUM_KEYS_PER_L0_FILE = 100; + final int KEY_SIZE = 20; + final int VALUE_SIZE = 300; + final int L0_FILE_SIZE = + NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE); + final int NUM_L0_FILES = 10; + final int TEST_SCALE = 5; + final int KEY_INTERVAL = 100; + + try (final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions(). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(5). + // a slightly bigger write buffer than L0 file + // so that we can ensure manual flush always + // go before background flush happens. + setWriteBufferSize(L0_FILE_SIZE * 2). + // Disable auto L0 -> L1 compaction + setLevelZeroFileNumCompactionTrigger(20). + setTargetFileSizeBase(L0_FILE_SIZE * 100). + setTargetFileSizeMultiplier(1). + // To disable auto compaction + setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100). + setMaxBytesForLevelMultiplier(2). 
+ setDisableAutoCompactions(true) + ) { + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts) + ); + + final List columnFamilyHandles = new ArrayList<>(); // open database - db = RocksDB.open(opt, + try (final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, - columnFamilyHandles); - // fill database with key/value pairs - byte[] value = new byte[VALUE_SIZE]; - int int_key = 0; - for (int round = 0; round < 5; ++round) { - int initial_key = int_key; - for (int f = 1; f <= NUM_L0_FILES; ++f) { - for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) { - int_key += KEY_INTERVAL; - rand.nextBytes(value); + columnFamilyHandles)) { + try { + // fill database with key/value pairs + byte[] value = new byte[VALUE_SIZE]; + int int_key = 0; + for (int round = 0; round < 5; ++round) { + int initial_key = int_key; + for (int f = 1; f <= NUM_L0_FILES; ++f) { + for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) { + int_key += KEY_INTERVAL; + rand.nextBytes(value); - db.put(columnFamilyHandles.get(1), - String.format("%020d", int_key).getBytes(), - value); + db.put(columnFamilyHandles.get(1), + String.format("%020d", int_key).getBytes(), + value); + } + db.flush(new FlushOptions().setWaitForFlush(true), + columnFamilyHandles.get(1)); + // Make sure we do create one more L0 files. + assertThat( + db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level0")). + isEqualTo("" + f); + } + + // Compact all L0 files we just created + db.compactRange( + columnFamilyHandles.get(1), + String.format("%020d", initial_key).getBytes(), + String.format("%020d", int_key - 1).getBytes()); + // Making sure there isn't any L0 files. + assertThat( + db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level0")). + isEqualTo("0"); + // Making sure there are some L1 files. 
+ // Here we only use != 0 instead of a specific number + // as we don't want the test make any assumption on + // how compaction works. + assertThat( + db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level1")). + isNotEqualTo("0"); + // Because we only compacted those keys we issued + // in this round, there shouldn't be any L1 -> L2 + // compaction. So we expect zero L2 files here. + assertThat( + db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level2")). + isEqualTo("0"); + } + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); } - db.flush(new FlushOptions().setWaitForFlush(true), - columnFamilyHandles.get(1)); - // Make sure we do create one more L0 files. - assertThat( - db.getProperty(columnFamilyHandles.get(1), - "rocksdb.num-files-at-level0")). - isEqualTo("" + f); } - - // Compact all L0 files we just created - db.compactRange( - columnFamilyHandles.get(1), - String.format("%020d", initial_key).getBytes(), - String.format("%020d", int_key - 1).getBytes()); - // Making sure there isn't any L0 files. - assertThat( - db.getProperty(columnFamilyHandles.get(1), - "rocksdb.num-files-at-level0")). - isEqualTo("0"); - // Making sure there are some L1 files. - // Here we only use != 0 instead of a specific number - // as we don't want the test make any assumption on - // how compaction works. - assertThat( - db.getProperty(columnFamilyHandles.get(1), - "rocksdb.num-files-at-level1")). - isNotEqualTo("0"); - // Because we only compacted those keys we issued - // in this round, there shouldn't be any L1 -> L2 - // compaction. So we expect zero L2 files here. - assertThat( - db.getProperty(columnFamilyHandles.get(1), - "rocksdb.num-files-at-level2")). 
- isEqualTo("0"); - } - } finally { - for (ColumnFamilyHandle handle : columnFamilyHandles) { - handle.dispose(); - } - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); } } } + @Test + public void pauseContinueBackgroundWork() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { + db.pauseBackgroundWork(); + db.continueBackgroundWork(); + db.pauseBackgroundWork(); + db.continueBackgroundWork(); + } + } + @Test public void enableDisableFileDeletions() throws RocksDBException { - RocksDB db = null; - Options options = null; - try { - options = new Options().setCreateIfMissing(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { db.disableFileDeletions(); db.enableFileDeletions(false); db.disableFileDeletions(); db.enableFileDeletions(true); - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java index 5914e6e294..d89570aad2 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -17,22 +17,23 @@ public class RocksEnvTest { new RocksMemoryResource(); @Test - public void rocksEnv(){ - Env rocksEnv = RocksEnv.getDefault(); - rocksEnv.setBackgroundThreads(5); - // default rocksenv will always return zero for flush pool - // no matter what was set via setBackgroundThreads - assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). - isEqualTo(0); - rocksEnv.setBackgroundThreads(5, RocksEnv.FLUSH_POOL); - // default rocksenv will always return zero for flush pool - // no matter what was set via setBackgroundThreads - assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). - isEqualTo(0); - rocksEnv.setBackgroundThreads(5, RocksEnv.COMPACTION_POOL); - // default rocksenv will always return zero for compaction pool - // no matter what was set via setBackgroundThreads - assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.COMPACTION_POOL)). - isEqualTo(0); + public void rocksEnv() { + try (final Env rocksEnv = RocksEnv.getDefault()) { + rocksEnv.setBackgroundThreads(5); + // default rocksenv will always return zero for flush pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). + isEqualTo(0); + rocksEnv.setBackgroundThreads(5, RocksEnv.FLUSH_POOL); + // default rocksenv will always return zero for flush pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). + isEqualTo(0); + rocksEnv.setBackgroundThreads(5, RocksEnv.COMPACTION_POOL); + // default rocksenv will always return zero for compaction pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.COMPACTION_POOL)). 
+ isEqualTo(0); + } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java index 170170f5c1..4471df9cce 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -22,50 +22,36 @@ public class RocksIteratorTest { @Test public void rocksIterator() throws RocksDBException { - RocksDB db = null; - Options options = null; - RocksIterator iterator = null; - try { - options = new Options(); - options.setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { db.put("key1".getBytes(), "value1".getBytes()); db.put("key2".getBytes(), "value2".getBytes()); - iterator = db.newIterator(); - - iterator.seekToFirst(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key1".getBytes()); - assertThat(iterator.value()).isEqualTo("value1".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key2".getBytes()); - assertThat(iterator.value()).isEqualTo("value2".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isFalse(); - iterator.seekToLast(); - iterator.prev(); - assertThat(iterator.isValid()).isTrue(); - 
assertThat(iterator.key()).isEqualTo("key1".getBytes()); - assertThat(iterator.value()).isEqualTo("value1".getBytes()); - iterator.seekToFirst(); - iterator.seekToLast(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key2".getBytes()); - assertThat(iterator.value()).isEqualTo("value2".getBytes()); - iterator.status(); - } finally { - if (iterator != null) { - iterator.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + iterator.seekToLast(); + iterator.prev(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.seekToFirst(); + iterator.seekToLast(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.status(); } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java index d2791c93ea..141f7f8506 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -33,73 +33,55 @@ public void memEnvFillAndReopen() throws RocksDBException { "baz".getBytes() }; - Env env = null; - Options options = null; - RocksDB db = null; - FlushOptions flushOptions = null; - try { - env = new RocksMemEnv(); - options = new Options(). - setCreateIfMissing(true). - setEnv(env); - flushOptions = new FlushOptions(). - setWaitForFlush(true); - db = RocksDB.open(options, "dir/db"); - - // write key/value pairs using MemEnv - for (int i=0; i < keys.length; i++) { - db.put(keys[i], values[i]); + try (final Env env = new RocksMemEnv(); + final Options options = new Options() + .setCreateIfMissing(true) + .setEnv(env); + final FlushOptions flushOptions = new FlushOptions() + .setWaitForFlush(true); + ) { + try (final RocksDB db = RocksDB.open(options, "dir/db")) { + // write key/value pairs using MemEnv + for (int i = 0; i < keys.length; i++) { + db.put(keys[i], values[i]); + } + + // read key/value pairs using MemEnv + for (int i = 0; i < keys.length; i++) { + assertThat(db.get(keys[i])).isEqualTo(values[i]); + } + + // Check iterator access + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + for (int i = 0; i < keys.length; i++) { + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo(keys[i]); + assertThat(iterator.value()).isEqualTo(values[i]); + iterator.next(); + } + // reached end of database + assertThat(iterator.isValid()).isFalse(); + } + + // flush + db.flush(flushOptions); + + // read key/value pairs after flush using MemEnv + for (int i = 0; i < keys.length; i++) { + assertThat(db.get(keys[i])).isEqualTo(values[i]); + } } - // read key/value pairs using MemEnv - for (int i=0; i < keys.length; i++) { - assertThat(db.get(keys[i])).isEqualTo(values[i]); 
- } - - // Check iterator access - RocksIterator iterator = db.newIterator(); - iterator.seekToFirst(); - for (int i=0; i < keys.length; i++) { - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo(keys[i]); - assertThat(iterator.value()).isEqualTo(values[i]); - iterator.next(); - } - // reached end of database - assertThat(iterator.isValid()).isFalse(); - iterator.dispose(); - - // flush - db.flush(flushOptions); - - // read key/value pairs after flush using MemEnv - for (int i=0; i < keys.length; i++) { - assertThat(db.get(keys[i])).isEqualTo(values[i]); - } - - db.close(); options.setCreateIfMissing(false); // After reopen the values shall still be in the mem env. // as long as the env is not freed. - db = RocksDB.open(options, "dir/db"); - // read key/value pairs using MemEnv - for (int i=0; i < keys.length; i++) { - assertThat(db.get(keys[i])).isEqualTo(values[i]); - } - - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } - if (flushOptions != null) { - flushOptions.dispose(); - } - if (env != null) { - env.dispose(); + try (final RocksDB db = RocksDB.open(options, "dir/db")) { + // read key/value pairs using MemEnv + for (int i = 0; i < keys.length; i++) { + assertThat(db.get(keys[i])).isEqualTo(values[i]); + } } } } @@ -125,27 +107,22 @@ public void multipleDatabaseInstances() throws RocksDBException { "baz".getBytes() }; - Env env = null; - Options options = null; - RocksDB db = null, otherDb = null; - - try { - env = new RocksMemEnv(); - options = new Options(). - setCreateIfMissing(true). 
- setEnv(env); - db = RocksDB.open(options, "dir/db"); - otherDb = RocksDB.open(options, "dir/otherDb"); - + try (final Env env = new RocksMemEnv(); + final Options options = new Options() + .setCreateIfMissing(true) + .setEnv(env); + final RocksDB db = RocksDB.open(options, "dir/db"); + final RocksDB otherDb = RocksDB.open(options, "dir/otherDb") + ) { // write key/value pairs using MemEnv // to db and to otherDb. - for (int i=0; i < keys.length; i++) { + for (int i = 0; i < keys.length; i++) { db.put(keys[i], values[i]); otherDb.put(otherKeys[i], values[i]); } // verify key/value pairs after flush using MemEnv - for (int i=0; i < keys.length; i++) { + for (int i = 0; i < keys.length; i++) { // verify db assertThat(db.get(otherKeys[i])).isNull(); assertThat(db.get(keys[i])).isEqualTo(values[i]); @@ -154,43 +131,18 @@ public void multipleDatabaseInstances() throws RocksDBException { assertThat(otherDb.get(keys[i])).isNull(); assertThat(otherDb.get(otherKeys[i])).isEqualTo(values[i]); } - } finally { - if (db != null) { - db.close(); - } - if (otherDb != null) { - otherDb.close(); - } - if (options != null) { - options.dispose(); - } - if (env != null) { - env.dispose(); - } } } @Test(expected = RocksDBException.class) public void createIfMissingFalse() throws RocksDBException { - Env env = null; - Options options = null; - RocksDB db = null; - - try { - env = new RocksMemEnv(); - options = new Options(). - setCreateIfMissing(false). - setEnv(env); + try (final Env env = new RocksMemEnv(); + final Options options = new Options() + .setCreateIfMissing(false) + .setEnv(env); + final RocksDB db = RocksDB.open(options, "db/dir")) { // shall throw an exception because db dir does not // exist. 
- db = RocksDB.open(options, "db/dir"); - } finally { - if (options != null) { - options.dispose(); - } - if (env != null) { - env.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java b/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java index de9ba0d6b6..6fd1c7e667 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java @@ -5,7 +5,11 @@ /** * Resource to trigger garbage collection after each test * run. + * + * @deprecated Will be removed with the implementation of + * {@link RocksObject#finalize()} */ +@Deprecated public class RocksMemoryResource extends ExternalResource { static { diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java index fbd602b14a..952c9ab866 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -17,89 +17,45 @@ public class SliceTest { @Test public void slice() { - Slice slice = null; - Slice otherSlice = null; - Slice thirdSlice = null; - try { - slice = new Slice("testSlice"); + try (final Slice slice = new Slice("testSlice")) { assertThat(slice.empty()).isFalse(); assertThat(slice.size()).isEqualTo(9); assertThat(slice.data()).isEqualTo("testSlice".getBytes()); + } - otherSlice = new Slice("otherSlice".getBytes()); + try (final Slice otherSlice = new Slice("otherSlice".getBytes())) { assertThat(otherSlice.data()).isEqualTo("otherSlice".getBytes()); + } - thirdSlice = new Slice("otherSlice".getBytes(), 5); + try (final Slice thirdSlice = new Slice("otherSlice".getBytes(), 5)) { assertThat(thirdSlice.data()).isEqualTo("Slice".getBytes()); - } finally { - if (slice != null) { - slice.dispose(); - } - if (otherSlice != null) { - otherSlice.dispose(); - } - if (thirdSlice != null) { - thirdSlice.dispose(); - } } } @Test public void sliceEquals() { - Slice slice = null; - Slice slice2 = null; - try { - slice = new Slice("abc"); - slice2 = new Slice("abc"); + try (final Slice slice = new Slice("abc"); + final Slice slice2 = new Slice("abc")) { assertThat(slice.equals(slice2)).isTrue(); assertThat(slice.hashCode() == slice2.hashCode()).isTrue(); - } finally { - if (slice != null) { - slice.dispose(); - } - if (slice2 != null) { - slice2.dispose(); - } } } - @Test public void sliceStartWith() { - Slice slice = null; - Slice match = null; - Slice noMatch = null; - try { - slice = new Slice("matchpoint"); - match = new Slice("mat"); - noMatch = new Slice("nomatch"); - - //assertThat(slice.startsWith(match)).isTrue(); + try (final Slice slice = new Slice("matchpoint"); + final Slice match = new Slice("mat"); + final Slice noMatch = new Slice("nomatch")) { + assertThat(slice.startsWith(match)).isTrue(); assertThat(slice.startsWith(noMatch)).isFalse(); - } finally { - if (slice != null) { - slice.dispose(); - } - if (match != null) { - match.dispose(); - } - if 
(noMatch != null) { - noMatch.dispose(); - } } } @Test public void sliceToString() { - Slice slice = null; - try { - slice = new Slice("stringTest"); + try (final Slice slice = new Slice("stringTest")) { assertThat(slice.toString()).isEqualTo("stringTest"); assertThat(slice.toString(true)).isNotEqualTo(""); - } finally { - if (slice != null) { - slice.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java index 87ccdbcb5d..581bae50b9 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -22,195 +22,147 @@ public class SnapshotTest { @Test public void snapshots() throws RocksDBException { - RocksDB db = null; - Options options = null; - ReadOptions readOptions = null; - try { - - options = new Options(); - options.setCreateIfMissing(true); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); // Get new Snapshot of database - Snapshot snapshot = db.getSnapshot(); - assertThat(snapshot.getSequenceNumber()).isGreaterThan(0); - assertThat(snapshot.getSequenceNumber()).isEqualTo(1); - readOptions = new ReadOptions(); - // set snapshot in ReadOptions - readOptions.setSnapshot(snapshot); - // retrieve key value pair - assertThat(new String(db.get("key".getBytes()))). 
- isEqualTo("value"); - // retrieve key value pair created before - // the snapshot was made - assertThat(new String(db.get(readOptions, - "key".getBytes()))).isEqualTo("value"); - // add new key/value pair - db.put("newkey".getBytes(), "newvalue".getBytes()); - // using no snapshot the latest db entries - // will be taken into account - assertThat(new String(db.get("newkey".getBytes()))). - isEqualTo("newvalue"); - // snapshopot was created before newkey - assertThat(db.get(readOptions, "newkey".getBytes())). - isNull(); - // Retrieve snapshot from read options - Snapshot sameSnapshot = readOptions.snapshot(); - readOptions.setSnapshot(sameSnapshot); - // results must be the same with new Snapshot - // instance using the same native pointer - assertThat(new String(db.get(readOptions, - "key".getBytes()))).isEqualTo("value"); - // update key value pair to newvalue - db.put("key".getBytes(), "newvalue".getBytes()); - // read with previously created snapshot will - // read previous version of key value pair - assertThat(new String(db.get(readOptions, - "key".getBytes()))).isEqualTo("value"); - // read for newkey using the snapshot must be - // null - assertThat(db.get(readOptions, "newkey".getBytes())). - isNull(); - // setting null to snapshot in ReadOptions leads - // to no Snapshot being used. 
- readOptions.setSnapshot(null); - assertThat(new String(db.get(readOptions, - "newkey".getBytes()))).isEqualTo("newvalue"); - // release Snapshot - db.releaseSnapshot(snapshot); - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } - if (readOptions != null) { - readOptions.dispose(); + try (final Snapshot snapshot = db.getSnapshot()) { + assertThat(snapshot.getSequenceNumber()).isGreaterThan(0); + assertThat(snapshot.getSequenceNumber()).isEqualTo(1); + try (final ReadOptions readOptions = new ReadOptions()) { + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + + // retrieve key value pair + assertThat(new String(db.get("key".getBytes()))). + isEqualTo("value"); + // retrieve key value pair created before + // the snapshot was made + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // add new key/value pair + db.put("newkey".getBytes(), "newvalue".getBytes()); + // using no snapshot the latest db entries + // will be taken into account + assertThat(new String(db.get("newkey".getBytes()))). + isEqualTo("newvalue"); + // snapshopot was created before newkey + assertThat(db.get(readOptions, "newkey".getBytes())). + isNull(); + // Retrieve snapshot from read options + try (final Snapshot sameSnapshot = readOptions.snapshot()) { + readOptions.setSnapshot(sameSnapshot); + // results must be the same with new Snapshot + // instance using the same native pointer + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // update key value pair to newvalue + db.put("key".getBytes(), "newvalue".getBytes()); + // read with previously created snapshot will + // read previous version of key value pair + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // read for newkey using the snapshot must be + // null + assertThat(db.get(readOptions, "newkey".getBytes())). 
+ isNull(); + // setting null to snapshot in ReadOptions leads + // to no Snapshot being used. + readOptions.setSnapshot(null); + assertThat(new String(db.get(readOptions, + "newkey".getBytes()))).isEqualTo("newvalue"); + // release Snapshot + db.releaseSnapshot(snapshot); + } + } } } } @Test public void iteratorWithSnapshot() throws RocksDBException { - RocksDB db = null; - Options options = null; - ReadOptions readOptions = null; - RocksIterator iterator = null; - RocksIterator snapshotIterator = null; - try { - - options = new Options(); - options.setCreateIfMissing(true); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database - Snapshot snapshot = db.getSnapshot(); - readOptions = new ReadOptions(); // set snapshot in ReadOptions - readOptions.setSnapshot(snapshot); - db.put("key2".getBytes(), "value2".getBytes()); - - // iterate over current state of db - iterator = db.newIterator(); - iterator.seekToFirst(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key2".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isFalse(); - - // iterate using a snapshot - snapshotIterator = db.newIterator(readOptions); - snapshotIterator.seekToFirst(); - assertThat(snapshotIterator.isValid()).isTrue(); - assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); - snapshotIterator.next(); - assertThat(snapshotIterator.isValid()).isFalse(); - - // release Snapshot - db.releaseSnapshot(snapshot); - } finally { - if (iterator != null) { - iterator.dispose(); - } - if (snapshotIterator != null) { - snapshotIterator.dispose(); - } - if (db != null) { - 
db.close(); - } - if (options != null) { - options.dispose(); - } - if (readOptions != null) { - readOptions.dispose(); + try (final Snapshot snapshot = db.getSnapshot(); + final ReadOptions readOptions = + new ReadOptions().setSnapshot(snapshot)) { + db.put("key2".getBytes(), "value2".getBytes()); + + // iterate over current state of db + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + } + + // iterate using a snapshot + try (final RocksIterator snapshotIterator = + db.newIterator(readOptions)) { + snapshotIterator.seekToFirst(); + assertThat(snapshotIterator.isValid()).isTrue(); + assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); + snapshotIterator.next(); + assertThat(snapshotIterator.isValid()).isFalse(); + } + + // release Snapshot + db.releaseSnapshot(snapshot); } } } @Test public void iteratorWithSnapshotOnColumnFamily() throws RocksDBException { - RocksDB db = null; - Options options = null; - ReadOptions readOptions = null; - RocksIterator iterator = null; - RocksIterator snapshotIterator = null; - try { + try (final Options options = new Options() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { - options = new Options(); - options.setCreateIfMissing(true); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database - Snapshot snapshot = db.getSnapshot(); - readOptions = new ReadOptions(); // set snapshot in ReadOptions - readOptions.setSnapshot(snapshot); - db.put("key2".getBytes(), "value2".getBytes()); - - // iterate over current state of column family - iterator = 
db.newIterator(db.getDefaultColumnFamily()); - iterator.seekToFirst(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key2".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isFalse(); - - // iterate using a snapshot on default column family - snapshotIterator = db.newIterator(db.getDefaultColumnFamily(), - readOptions); - snapshotIterator.seekToFirst(); - assertThat(snapshotIterator.isValid()).isTrue(); - assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); - snapshotIterator.next(); - assertThat(snapshotIterator.isValid()).isFalse(); - - // release Snapshot - db.releaseSnapshot(snapshot); - } finally { - if (iterator != null) { - iterator.dispose(); - } - if (snapshotIterator != null) { - snapshotIterator.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } - if (readOptions != null) { - readOptions.dispose(); + try (final Snapshot snapshot = db.getSnapshot(); + final ReadOptions readOptions = new ReadOptions() + .setSnapshot(snapshot)) { + db.put("key2".getBytes(), "value2".getBytes()); + + // iterate over current state of column family + try (final RocksIterator iterator = db.newIterator( + db.getDefaultColumnFamily())) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + } + + // iterate using a snapshot on default column family + try (final RocksIterator snapshotIterator = db.newIterator( + db.getDefaultColumnFamily(), readOptions)) { + snapshotIterator.seekToFirst(); + assertThat(snapshotIterator.isValid()).isTrue(); + 
assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); + snapshotIterator.next(); + assertThat(snapshotIterator.isValid()).isFalse(); + + // release Snapshot + db.releaseSnapshot(snapshot); + } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java index 927826d71a..9f014d1d3f 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -26,19 +26,18 @@ public class StatisticsCollectorTest { @Test public void statisticsCollector() throws InterruptedException, RocksDBException { - Options opt = null; - RocksDB db = null; - try { - opt = new Options().createStatistics().setCreateIfMissing(true); - Statistics stats = opt.statisticsPtr(); - - db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); - - StatsCallbackMock callback = new StatsCallbackMock(); - StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); - - StatisticsCollector statsCollector = new StatisticsCollector( + try (final Options opt = new Options() + .createStatistics() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + final Statistics stats = opt.statisticsPtr(); + + final StatsCallbackMock callback = new StatsCallbackMock(); + final StatsCollectorInput statsInput = + new StatsCollectorInput(stats, callback); + + final StatisticsCollector statsCollector = new StatisticsCollector( 
Collections.singletonList(statsInput), 100); statsCollector.start(); @@ -48,13 +47,6 @@ public void statisticsCollector() assertThat(callback.histCallbackCount).isGreaterThan(0); statsCollector.shutDown(1000); - } finally { - if (db != null) { - db.close(); - } - if (opt != null) { - opt.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java b/external/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java index 3c5800e426..2e28f28efa 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java index 1de2efdeaf..b619258ecc 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java @@ -17,43 +17,27 @@ public class TransactionLogIteratorTest { @Test public void transactionLogIterator() throws RocksDBException { - RocksDB db = null; - Options options = null; - TransactionLogIterator transactionLogIterator = null; - try { - options = new Options(). 
- setCreateIfMissing(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - transactionLogIterator = db.getUpdatesSince(0); - } finally { - if (transactionLogIterator != null) { - transactionLogIterator.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } + try (final Options options = new Options() + .setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + final TransactionLogIterator transactionLogIterator = + db.getUpdatesSince(0)) { + //no-op } } @Test public void getBatch() throws RocksDBException { final int numberOfPuts = 5; - RocksDB db = null; - Options options = null; - ColumnFamilyHandle cfHandle = null; - TransactionLogIterator transactionLogIterator = null; - try { - options = new Options(). - setCreateIfMissing(true). - setWalTtlSeconds(1000). - setWalSizeLimitMB(10); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - - for (int i = 0; i < numberOfPuts; i++){ + try (final Options options = new Options() + .setCreateIfMissing(true) + .setWalTtlSeconds(1000) + .setWalSizeLimitMB(10); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + + for (int i = 0; i < numberOfPuts; i++) { db.put(String.valueOf(i).getBytes(), String.valueOf(i).getBytes()); } @@ -65,117 +49,89 @@ public void getBatch() throws RocksDBException { isEqualTo(numberOfPuts); // insert 5 writes into a cf - cfHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes())); - - for (int i = 0; i < numberOfPuts; i++){ - db.put(cfHandle, String.valueOf(i).getBytes(), - String.valueOf(i).getBytes()); - } - // the latest sequence number is 10 because - // (5 + 5) puts were written beforehand - assertThat(db.getLatestSequenceNumber()). 
- isEqualTo(numberOfPuts + numberOfPuts); - - // Get updates since the beginning - transactionLogIterator = db.getUpdatesSince(0); - assertThat(transactionLogIterator.isValid()).isTrue(); - transactionLogIterator.status(); - - // The first sequence number is 1 - TransactionLogIterator.BatchResult batchResult = - transactionLogIterator.getBatch(); - assertThat(batchResult.sequenceNumber()).isEqualTo(1); - } finally { - if (transactionLogIterator != null) { - transactionLogIterator.dispose(); - } - if (cfHandle != null) { - cfHandle.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + try (final ColumnFamilyHandle cfHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf".getBytes()))) { + for (int i = 0; i < numberOfPuts; i++) { + db.put(cfHandle, String.valueOf(i).getBytes(), + String.valueOf(i).getBytes()); + } + // the latest sequence number is 10 because + // (5 + 5) puts were written beforehand + assertThat(db.getLatestSequenceNumber()). + isEqualTo(numberOfPuts + numberOfPuts); + + // Get updates since the beginning + try (final TransactionLogIterator transactionLogIterator = + db.getUpdatesSince(0)) { + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.status(); + + // The first sequence number is 1 + final TransactionLogIterator.BatchResult batchResult = + transactionLogIterator.getBatch(); + assertThat(batchResult.sequenceNumber()).isEqualTo(1); + } } } } @Test - public void transactionLogIteratorStallAtLastRecord() throws RocksDBException { - RocksDB db = null; - Options options = null; - TransactionLogIterator transactionLogIterator = null; - try { - options = new Options(). - setCreateIfMissing(true). - setWalTtlSeconds(1000). 
- setWalSizeLimitMB(10); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + public void transactionLogIteratorStallAtLastRecord() + throws RocksDBException { + try (final Options options = new Options() + .setCreateIfMissing(true) + .setWalTtlSeconds(1000) + .setWalSizeLimitMB(10); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); // Get updates since the beginning - transactionLogIterator = db.getUpdatesSince(0); - transactionLogIterator.status(); - assertThat(transactionLogIterator.isValid()).isTrue(); - transactionLogIterator.next(); - assertThat(transactionLogIterator.isValid()).isFalse(); - transactionLogIterator.status(); - db.put("key2".getBytes(), "value2".getBytes()); - transactionLogIterator.next(); - transactionLogIterator.status(); - assertThat(transactionLogIterator.isValid()).isTrue(); - - } finally { - if (transactionLogIterator != null) { - transactionLogIterator.dispose(); - } - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); + try (final TransactionLogIterator transactionLogIterator = + db.getUpdatesSince(0)) { + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.next(); + assertThat(transactionLogIterator.isValid()).isFalse(); + transactionLogIterator.status(); + db.put("key2".getBytes(), "value2".getBytes()); + transactionLogIterator.next(); + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); } } } @Test - public void transactionLogIteratorCheckAfterRestart() throws RocksDBException { + public void transactionLogIteratorCheckAfterRestart() + throws RocksDBException { final int numberOfKeys = 2; - RocksDB db = null; - Options options = null; - TransactionLogIterator transactionLogIterator = null; - try { - options = new Options(). - setCreateIfMissing(true). - setWalTtlSeconds(1000). 
- setWalSizeLimitMB(10); - - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - db.put("key1".getBytes(), "value1".getBytes()); - db.put("key2".getBytes(), "value2".getBytes()); - db.flush(new FlushOptions().setWaitForFlush(true)); - // reopen - db.close(); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - assertThat(db.getLatestSequenceNumber()).isEqualTo(numberOfKeys); + try (final Options options = new Options() + .setCreateIfMissing(true) + .setWalTtlSeconds(1000) + .setWalSizeLimitMB(10)) { + + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + db.flush(new FlushOptions().setWaitForFlush(true)); - transactionLogIterator = db.getUpdatesSince(0); - for (int i = 0; i < numberOfKeys; i++) { - transactionLogIterator.status(); - assertThat(transactionLogIterator.isValid()).isTrue(); - transactionLogIterator.next(); - } - } finally { - if (transactionLogIterator != null) { - transactionLogIterator.dispose(); - } - if (db != null) { - db.close(); } - if (options != null) { - options.dispose(); + + // reopen + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(db.getLatestSequenceNumber()).isEqualTo(numberOfKeys); + + try (final TransactionLogIterator transactionLogIterator = + db.getUpdatesSince(0)) { + for (int i = 0; i < numberOfKeys; i++) { + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.next(); + } + } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java index c60b1d5127..6539ea4e01 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,6 +11,7 @@ import org.junit.rules.TemporaryFolder; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeUnit; @@ -26,108 +27,74 @@ public class TtlDBTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void ttlDBOpen() throws RocksDBException, - InterruptedException { - Options options = null; - TtlDB ttlDB = null; - try { - options = new Options(). - setCreateIfMissing(true). - setMaxGrandparentOverlapFactor(0); - ttlDB = TtlDB.open(options, - dbFolder.getRoot().getAbsolutePath()); + public void ttlDBOpen() throws RocksDBException, InterruptedException { + try (final Options options = new Options() + .setCreateIfMissing(true) + .setMaxGrandparentOverlapFactor(0); + final TtlDB ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { ttlDB.put("key".getBytes(), "value".getBytes()); assertThat(ttlDB.get("key".getBytes())). isEqualTo("value".getBytes()); assertThat(ttlDB.get("key".getBytes())).isNotNull(); - } finally { - if (ttlDB != null) { - ttlDB.close(); - } - if (options != null) { - options.dispose(); - } } } @Test - public void ttlDBOpenWithTtl() throws RocksDBException, - InterruptedException { - Options options = null; - TtlDB ttlDB = null; - try { - options = new Options(). - setCreateIfMissing(true). 
- setMaxGrandparentOverlapFactor(0); - ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), - 1, false); + public void ttlDBOpenWithTtl() throws RocksDBException, InterruptedException { + try (final Options options = new Options() + .setCreateIfMissing(true) + .setMaxGrandparentOverlapFactor(0); + final TtlDB ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath(), 1, false); + ) { ttlDB.put("key".getBytes(), "value".getBytes()); assertThat(ttlDB.get("key".getBytes())). isEqualTo("value".getBytes()); TimeUnit.SECONDS.sleep(2); - ttlDB.compactRange(); assertThat(ttlDB.get("key".getBytes())).isNull(); - } finally { - if (ttlDB != null) { - ttlDB.close(); - } - if (options != null) { - options.dispose(); - } } } @Test - public void ttlDbOpenWithColumnFamilies() throws RocksDBException, InterruptedException { - DBOptions dbOptions = null; - TtlDB ttlDB = null; - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); - List ttlValues = new ArrayList<>(); - // Default column family with infinite lifetime - ttlValues.add(0); - // new column family with 1 second ttl - ttlValues.add(1); - - try { - dbOptions = new DBOptions(). - setCreateMissingColumnFamilies(true). - setCreateIfMissing(true); - ttlDB = TtlDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList, ttlValues, false); - - ttlDB.put("key".getBytes(), "value".getBytes()); - assertThat(ttlDB.get("key".getBytes())). 
- isEqualTo("value".getBytes()); - ttlDB.put(columnFamilyHandleList.get(1), "key".getBytes(), - "value".getBytes()); - assertThat(ttlDB.get(columnFamilyHandleList.get(1), - "key".getBytes())).isEqualTo("value".getBytes()); - TimeUnit.SECONDS.sleep(2); - - ttlDB.compactRange(); - ttlDB.compactRange(columnFamilyHandleList.get(1)); - - assertThat(ttlDB.get("key".getBytes())).isNotNull(); - assertThat(ttlDB.get(columnFamilyHandleList.get(1), - "key".getBytes())).isNull(); - - - } finally { - for (ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - if (ttlDB != null) { - ttlDB.close(); - } - if (dbOptions != null) { - dbOptions.dispose(); + public void ttlDbOpenWithColumnFamilies() throws RocksDBException, + InterruptedException { + final List cfNames = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes()) + ); + final List ttlValues = Arrays.asList(0, 1); + + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions dbOptions = new DBOptions() + .setCreateMissingColumnFamilies(true) + .setCreateIfMissing(true); + final TtlDB ttlDB = TtlDB.open(dbOptions, + dbFolder.getRoot().getAbsolutePath(), cfNames, + columnFamilyHandleList, ttlValues, false)) { + try { + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). 
+ isEqualTo("value".getBytes()); + ttlDB.put(columnFamilyHandleList.get(1), "key".getBytes(), + "value".getBytes()); + assertThat(ttlDB.get(columnFamilyHandleList.get(1), + "key".getBytes())).isEqualTo("value".getBytes()); + TimeUnit.SECONDS.sleep(2); + + ttlDB.compactRange(); + ttlDB.compactRange(columnFamilyHandleList.get(1)); + + assertThat(ttlDB.get("key".getBytes())).isNotNull(); + assertThat(ttlDB.get(columnFamilyHandleList.get(1), + "key".getBytes())).isNull(); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } } } } @@ -135,15 +102,12 @@ public void ttlDbOpenWithColumnFamilies() throws RocksDBException, InterruptedEx @Test public void createTtlColumnFamily() throws RocksDBException, InterruptedException { - Options options = null; - TtlDB ttlDB = null; - ColumnFamilyHandle columnFamilyHandle = null; - try { - options = new Options().setCreateIfMissing(true); - ttlDB = TtlDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - columnFamilyHandle = ttlDB.createColumnFamilyWithTtl( - new ColumnFamilyDescriptor("new_cf".getBytes()), 1); + try (final Options options = new Options().setCreateIfMissing(true); + final TtlDB ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + final ColumnFamilyHandle columnFamilyHandle = + ttlDB.createColumnFamilyWithTtl( + new ColumnFamilyDescriptor("new_cf".getBytes()), 1)) { ttlDB.put(columnFamilyHandle, "key".getBytes(), "value".getBytes()); assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())). 
@@ -151,16 +115,6 @@ public void createTtlColumnFamily() throws RocksDBException, TimeUnit.SECONDS.sleep(2); ttlDB.compactRange(columnFamilyHandle); assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull(); - } finally { - if (columnFamilyHandle != null) { - columnFamilyHandle.dispose(); - } - if (ttlDB != null) { - ttlDB.close(); - } - if (options != null) { - options.dispose(); - } } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/Types.java b/external/rocksdb/java/src/test/java/org/rocksdb/Types.java index 5ad35f463b..ca5feb4cb1 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/Types.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/Types.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java index b09cc92599..35c63f2af1 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -23,28 +23,26 @@ public class WriteBatchHandlerTest { @Test public void writeBatchHandler() throws IOException, RocksDBException { - WriteBatch batch = null; - CapturingWriteBatchHandler handler = null; - try { - // setup test data - final List>> testEvents = new ArrayList<>(); - testEvents.add(new Tuple<>(Action.DELETE, - new Tuple("k0".getBytes(), null))); - testEvents.add(new Tuple<>(Action.PUT, - new Tuple<>("k1".getBytes(), "v1".getBytes()))); - testEvents.add(new Tuple<>(Action.PUT, - new Tuple<>("k2".getBytes(), "v2".getBytes()))); - testEvents.add(new Tuple<>(Action.PUT, - new Tuple<>("k3".getBytes(), "v3".getBytes()))); - testEvents.add(new Tuple<>(Action.LOG, - new Tuple(null, "log1".getBytes()))); - testEvents.add(new Tuple<>(Action.MERGE, - new Tuple<>("k2".getBytes(), "v22".getBytes()))); - testEvents.add(new Tuple<>(Action.DELETE, - new Tuple("k3".getBytes(), null))); - - // load test data to the write batch - batch = new WriteBatch(); + // setup test data + final List>> testEvents = Arrays.asList( + new Tuple<>(Action.DELETE, + new Tuple("k0".getBytes(), null)), + new Tuple<>(Action.PUT, + new Tuple<>("k1".getBytes(), "v1".getBytes())), + new Tuple<>(Action.PUT, + new Tuple<>("k2".getBytes(), "v2".getBytes())), + new Tuple<>(Action.PUT, + new Tuple<>("k3".getBytes(), "v3".getBytes())), + new Tuple<>(Action.LOG, + new Tuple(null, "log1".getBytes())), + new Tuple<>(Action.MERGE, + new Tuple<>("k2".getBytes(), "v22".getBytes())), + new Tuple<>(Action.DELETE, + new Tuple("k3".getBytes(), null)) + ); + + // load test data to the write batch + try (final WriteBatch batch = new WriteBatch()) { for (final Tuple> testEvent : testEvents) { final Tuple data = testEvent.value; switch (testEvent.key) { @@ -67,29 +65,27 @@ public void writeBatchHandler() throws IOException, RocksDBException { } } - // attempt to read test data back from the WriteBatch by iterating with a handler - handler = new CapturingWriteBatchHandler(); - batch.iterate(handler); + // 
attempt to read test data back from the WriteBatch by iterating + // with a handler + try (final CapturingWriteBatchHandler handler = + new CapturingWriteBatchHandler()) { + batch.iterate(handler); - // compare the results to the test data - final List>> actualEvents = handler.getEvents(); - assertThat(testEvents.size()).isSameAs(actualEvents.size()); + // compare the results to the test data + final List>> actualEvents = + handler.getEvents(); + assertThat(testEvents.size()).isSameAs(actualEvents.size()); - for (int i = 0; i < testEvents.size(); i++) { - assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); - } - } finally { - if (handler != null) { - handler.dispose(); - } - if (batch != null) { - batch.dispose(); + for (int i = 0; i < testEvents.size(); i++) { + assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); + } } } } - private static boolean equals(final Tuple> expected, - final Tuple> actual) { + private static boolean equals( + final Tuple> expected, + final Tuple> actual) { if (!expected.key.equals(actual.key)) { return false; } @@ -136,7 +132,8 @@ private enum Action { */ private static class CapturingWriteBatchHandler extends WriteBatch.Handler { - private final List>> events = new ArrayList<>(); + private final List>> events + = new ArrayList<>(); /** * Returns a copy of the current events list @@ -159,12 +156,14 @@ public void merge(final byte[] key, final byte[] value) { @Override public void delete(final byte[] key) { - events.add(new Tuple<>(Action.DELETE, new Tuple(key, null))); + events.add(new Tuple<>(Action.DELETE, + new Tuple(key, null))); } @Override public void logData(final byte[] blob) { - events.add(new Tuple<>(Action.LOG, new Tuple(null, blob))); + events.add(new Tuple<>(Action.LOG, + new Tuple(null, blob))); } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java index 89a9d5405a..ba5d003970 100644 --- 
a/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,15 +14,16 @@ import org.junit.rules.TemporaryFolder; import java.io.UnsupportedEncodingException; +import java.util.Arrays; import static org.assertj.core.api.Assertions.assertThat; /** * This class mimics the db/write_batch_test.cc * in the c++ rocksdb library. - * + *

        * Not ported yet: - * + *

        * Continue(); * PutGatherSlices(); */ @@ -36,80 +37,207 @@ public class WriteBatchTest { @Test public void emptyWriteBatch() { - WriteBatch batch = new WriteBatch(); - assertThat(batch.count()).isEqualTo(0); + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.count()).isEqualTo(0); + } } @Test public void multipleBatchOperations() throws UnsupportedEncodingException { - WriteBatch batch = new WriteBatch(); - batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - batch.remove("box".getBytes("US-ASCII")); - batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); - WriteBatchTestInternalHelper.setSequence(batch, 100); - assertThat(WriteBatchTestInternalHelper.sequence(batch)). - isNotNull(). - isEqualTo(100); - assertThat(batch.count()).isEqualTo(3); - assertThat(new String(getContents(batch), "US-ASCII")). - isEqualTo("Put(baz, boo)@102" + - "Delete(box)@101" + - "Put(foo, bar)@100"); + try (WriteBatch batch = new WriteBatch()) { + batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + batch.remove("box".getBytes("US-ASCII")); + batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); + + WriteBatchTestInternalHelper.setSequence(batch, 100); + assertThat(WriteBatchTestInternalHelper.sequence(batch)). + isNotNull(). + isEqualTo(100); + assertThat(batch.count()).isEqualTo(3); + assertThat(new String(getContents(batch), "US-ASCII")). 
+ isEqualTo("Put(baz, boo)@102" + + "Delete(box)@101" + + "Put(foo, bar)@100"); + } } @Test public void testAppendOperation() throws UnsupportedEncodingException { - WriteBatch b1 = new WriteBatch(); - WriteBatch b2 = new WriteBatch(); - WriteBatchTestInternalHelper.setSequence(b1, 200); - WriteBatchTestInternalHelper.setSequence(b2, 300); - WriteBatchTestInternalHelper.append(b1, b2); - assertThat(getContents(b1).length).isEqualTo(0); - assertThat(b1.count()).isEqualTo(0); - b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); - WriteBatchTestInternalHelper.append(b1, b2); - assertThat("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII"))); - assertThat(b1.count()).isEqualTo(1); - b2.clear(); - b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); - WriteBatchTestInternalHelper.append(b1, b2); - assertThat(("Put(a, va)@200" + - "Put(b, vb)@201") - .equals(new String(getContents(b1), "US-ASCII"))); - assertThat(b1.count()).isEqualTo(2); - b2.remove("foo".getBytes("US-ASCII")); - WriteBatchTestInternalHelper.append(b1, b2); - assertThat(("Put(a, va)@200" + - "Put(b, vb)@202" + - "Put(b, vb)@201" + - "Delete(foo)@203") - .equals(new String(getContents(b1), "US-ASCII"))); - assertThat(b1.count()).isEqualTo(4); + try (final WriteBatch b1 = new WriteBatch(); + final WriteBatch b2 = new WriteBatch()) { + WriteBatchTestInternalHelper.setSequence(b1, 200); + WriteBatchTestInternalHelper.setSequence(b2, 300); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat(getContents(b1).length).isEqualTo(0); + assertThat(b1.count()).isEqualTo(0); + b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat("Put(a, va)@200".equals(new String(getContents(b1), + "US-ASCII"))); + assertThat(b1.count()).isEqualTo(1); + b2.clear(); + b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat(("Put(a, va)@200" + + "Put(b, vb)@201") + 
.equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(2); + b2.remove("foo".getBytes("US-ASCII")); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat(("Put(a, va)@200" + + "Put(b, vb)@202" + + "Put(b, vb)@201" + + "Delete(foo)@203") + .equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(4); + } } @Test public void blobOperation() throws UnsupportedEncodingException { - WriteBatch batch = new WriteBatch(); - batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); - batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); - batch.putLogData("blob1".getBytes("US-ASCII")); - batch.remove("k2".getBytes("US-ASCII")); - batch.putLogData("blob2".getBytes("US-ASCII")); - batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - assertThat(batch.count()).isEqualTo(5); - assertThat(("Merge(foo, bar)@4" + - "Put(k1, v1)@0" + - "Delete(k2)@3" + - "Put(k2, v2)@1" + - "Put(k3, v3)@2") - .equals(new String(getContents(batch), "US-ASCII"))); + try (final WriteBatch batch = new WriteBatch()) { + batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); + batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); + batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); + batch.putLogData("blob1".getBytes("US-ASCII")); + batch.remove("k2".getBytes("US-ASCII")); + batch.putLogData("blob2".getBytes("US-ASCII")); + batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + assertThat(batch.count()).isEqualTo(5); + assertThat(("Merge(foo, bar)@4" + + "Put(k1, v1)@0" + + "Delete(k2)@3" + + "Put(k2, v2)@1" + + "Put(k3, v3)@2") + .equals(new String(getContents(batch), "US-ASCII"))); + } + } + + @Test + public void savePoints() + throws UnsupportedEncodingException, RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.put("k1".getBytes("US-ASCII"), 
"v1".getBytes("US-ASCII")); + batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); + batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); + + assertThat(getFromWriteBatch(batch, "k1")).isEqualTo("v1"); + assertThat(getFromWriteBatch(batch, "k2")).isEqualTo("v2"); + assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3"); + + + batch.setSavePoint(); + + batch.remove("k2".getBytes("US-ASCII")); + batch.put("k3".getBytes("US-ASCII"), "v3-2".getBytes("US-ASCII")); + + assertThat(getFromWriteBatch(batch, "k2")).isNull(); + assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-2"); + + + batch.setSavePoint(); + + batch.put("k3".getBytes("US-ASCII"), "v3-3".getBytes("US-ASCII")); + batch.put("k4".getBytes("US-ASCII"), "v4".getBytes("US-ASCII")); + + assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-3"); + assertThat(getFromWriteBatch(batch, "k4")).isEqualTo("v4"); + + + batch.rollbackToSavePoint(); + + assertThat(getFromWriteBatch(batch, "k2")).isNull(); + assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-2"); + assertThat(getFromWriteBatch(batch, "k4")).isNull(); + + + batch.rollbackToSavePoint(); + + assertThat(getFromWriteBatch(batch, "k1")).isEqualTo("v1"); + assertThat(getFromWriteBatch(batch, "k2")).isEqualTo("v2"); + assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3"); + assertThat(getFromWriteBatch(batch, "k4")).isNull(); + } + } + + @Test(expected = RocksDBException.class) + public void restorePoints_withoutSavePoints() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.rollbackToSavePoint(); + } + } + + @Test(expected = RocksDBException.class) + public void restorePoints_withoutSavePoints_nested() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + + batch.setSavePoint(); + batch.rollbackToSavePoint(); + + // without previous corresponding setSavePoint + batch.rollbackToSavePoint(); + } + } + + static byte[] getContents(final WriteBatch wb) { + 
return getContents(wb.nativeHandle_); + } + + static String getFromWriteBatch(final WriteBatch wb, final String key) + throws RocksDBException, UnsupportedEncodingException { + final WriteBatchGetter getter = + new WriteBatchGetter(key.getBytes("US-ASCII")); + wb.iterate(getter); + if(getter.getValue() != null) { + return new String(getter.getValue(), "US-ASCII"); + } else { + return null; + } } - static native byte[] getContents(WriteBatch batch); + private static native byte[] getContents(final long writeBatchHandle); + + private static class WriteBatchGetter extends WriteBatch.Handler { + + private final byte[] key; + private byte[] value; + + public WriteBatchGetter(final byte[] key) { + this.key = key; + } + + public byte[] getValue() { + return value; + } + + @Override + public void put(final byte[] key, final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.value = value; + } + } + + @Override + public void merge(final byte[] key, final byte[] value) { + if(Arrays.equals(this.key, key)) { + throw new UnsupportedOperationException(); + } + } + + @Override + public void delete(final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.value = null; + } + } + + @Override + public void logData(final byte[] blob) { + } + } } /** @@ -117,7 +245,23 @@ public void blobOperation() * c++ WriteBatchInternal. 
*/ class WriteBatchTestInternalHelper { - static native void setSequence(WriteBatch batch, long sn); - static native long sequence(WriteBatch batch); - static native void append(WriteBatch b1, WriteBatch b2); + static void setSequence(final WriteBatch wb, final long sn) { + setSequence(wb.nativeHandle_, sn); + } + + static long sequence(final WriteBatch wb) { + return sequence(wb.nativeHandle_); + } + + static void append(final WriteBatch wb1, final WriteBatch wb2) { + append(wb1.nativeHandle_, wb2.nativeHandle_); + } + + private static native void setSequence(final long writeBatchHandle, + final long sn); + + private static native long sequence(final long writeBatchHandle); + + private static native void append(final long writeBatchHandle1, + final long writeBatchHandle2); } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java new file mode 100644 index 0000000000..66e1c8966b --- /dev/null +++ b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameter; +import org.junit.runners.Parameterized.Parameters; + +import java.nio.ByteBuffer; +import java.util.*; +import java.util.concurrent.*; + +@RunWith(Parameterized.class) +public class WriteBatchThreadedTest { + + @Parameters(name = "WriteBatchThreadedTest(threadCount={0})") + public static Iterable data() { + return Arrays.asList(new Integer[]{1, 10, 50, 100}); + } + + @Parameter + public int threadCount; + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + RocksDB db; + + @Before + public void setUp() throws Exception { + RocksDB.loadLibrary(); + final Options options = new Options() + .setCreateIfMissing(true) + .setIncreaseParallelism(32); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + assert (db != null); + } + + @After + public void tearDown() throws Exception { + if (db != null) { + db.close(); + } + } + + @Test + public void threadedWrites() throws InterruptedException, ExecutionException { + final List> callables = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + final int offset = i * 100; + callables.add(new Callable() { + @Override + public Void call() throws RocksDBException { + final WriteBatch wb = new WriteBatch(); + for (int i = offset; i < offset + 100; i++) { + wb.put(ByteBuffer.allocate(4).putInt(i).array(), + "parallel rocks test".getBytes()); + } + db.write(new WriteOptions(), wb); + + return null; + } + }); + } + + //submit the callables + final ExecutorService executorService = + Executors.newFixedThreadPool(threadCount); + try { + final ExecutorCompletionService completionService = + new ExecutorCompletionService<>(executorService); + final Set> futures = new HashSet<>(); + for (final Callable callable : 
callables) { + futures.add(completionService.submit(callable)); + } + + while (futures.size() > 0) { + final Future future = completionService.take(); + futures.remove(future); + + try { + future.get(); + } catch (final ExecutionException e) { + for (final Future f : futures) { + f.cancel(true); + } + + throw e; + } + } + } finally { + executorService.shutdown(); + executorService.awaitTermination(10, TimeUnit.SECONDS); + } + } +} diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java index b0c729a583..726c6f2915 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -14,9 +14,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; +import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; -import java.util.ArrayDeque; -import java.util.Deque; +import java.util.Arrays; import static org.assertj.core.api.Assertions.assertThat; @@ -32,13 +32,9 @@ public class WriteBatchWithIndexTest { @Test public void readYourOwnWrites() throws RocksDBException { - RocksDB db = null; - Options options = null; - try { - options = new Options(); - // Setup options - options.setCreateIfMissing(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { final byte[] k1 = "key1".getBytes(); final byte[] v1 = "value1".getBytes(); @@ -48,13 +44,9 @@ public void readYourOwnWrites() throws RocksDBException { db.put(k1, v1); db.put(k2, v2); - final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); - - RocksIterator base = null; - RocksIterator it = null; - try { - base = db.newIterator(); - it = wbwi.newIteratorWithBase(base); + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator base = db.newIterator(); + final RocksIterator it = wbwi.newIteratorWithBase(base)) { it.seek(k1); assertThat(it.isValid()).isTrue(); @@ -95,171 +87,226 @@ public void readYourOwnWrites() throws RocksDBException { assertThat(it.isValid()).isTrue(); assertThat(it.key()).isEqualTo(k1); assertThat(it.value()).isEqualTo(v1Other); - } finally { - if (it != null) { - it.dispose(); - } - if (base != null) { - base.dispose(); - } - } - - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); } } } @Test public void write_writeBatchWithIndex() throws RocksDBException { - RocksDB db = null; - Options options = null; - try { - options = new Options(); - // Setup options - options.setCreateIfMissing(true); 
- db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { final byte[] k1 = "key1".getBytes(); final byte[] v1 = "value1".getBytes(); final byte[] k2 = "key2".getBytes(); final byte[] v2 = "value2".getBytes(); - WriteBatchWithIndex wbwi = null; - - try { - wbwi = new WriteBatchWithIndex(); - - + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { wbwi.put(k1, v1); wbwi.put(k2, v2); db.write(new WriteOptions(), wbwi); - } finally { - if(wbwi != null) { - wbwi.dispose(); - } } assertThat(db.get(k1)).isEqualTo(v1); assertThat(db.get(k2)).isEqualTo(v2); - - } finally { - if (db != null) { - db.close(); - } - if (options != null) { - options.dispose(); - } } } @Test public void iterator() throws RocksDBException { - final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); - - final String k1 = "key1"; - final String v1 = "value1"; - final String k2 = "key2"; - final String v2 = "value2"; - final String k3 = "key3"; - final String v3 = "value3"; - final byte[] k1b = k1.getBytes(); - final byte[] v1b = v1.getBytes(); - final byte[] k2b = k2.getBytes(); - final byte[] v2b = v2.getBytes(); - final byte[] k3b = k3.getBytes(); - final byte[] v3b = v3.getBytes(); - - //add put records - wbwi.put(k1b, v1b); - wbwi.put(k2b, v2b); - wbwi.put(k3b, v3b); - - //add a deletion record - final String k4 = "key4"; - final byte[] k4b = k4.getBytes(); - wbwi.remove(k4b); - - WBWIRocksIterator.WriteEntry[] expected = { - new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, - new DirectSlice(k1), new DirectSlice(v1)), - new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, - new DirectSlice(k2), new DirectSlice(v2)), - new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, - new DirectSlice(k3), new DirectSlice(v3)), - new 
WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE, - new DirectSlice(k4), DirectSlice.NONE) - }; - - WBWIRocksIterator it = null; - try { - it = wbwi.newIterator(); - - //direct access - seek to key offsets - final int[] testOffsets = {2, 0, 1, 3}; - - for(int i = 0; i < testOffsets.length; i++) { - final int testOffset = testOffsets[i]; - final byte[] key = toArray(expected[testOffset].getKey().data()); - - it.seek(key); - assertThat(it.isValid()).isTrue(); - assertThat(it.entry().equals(expected[testOffset])).isTrue(); - } + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true)) { + + final String k1 = "key1"; + final String v1 = "value1"; + final String k2 = "key2"; + final String v2 = "value2"; + final String k3 = "key3"; + final String v3 = "value3"; + final byte[] k1b = k1.getBytes(); + final byte[] v1b = v1.getBytes(); + final byte[] k2b = k2.getBytes(); + final byte[] v2b = v2.getBytes(); + final byte[] k3b = k3.getBytes(); + final byte[] v3b = v3.getBytes(); + + //add put records + wbwi.put(k1b, v1b); + wbwi.put(k2b, v2b); + wbwi.put(k3b, v3b); + + //add a deletion record + final String k4 = "key4"; + final byte[] k4b = k4.getBytes(); + wbwi.remove(k4b); + + final WBWIRocksIterator.WriteEntry[] expected = { + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k1), new DirectSlice(v1)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k2), new DirectSlice(v2)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k3), new DirectSlice(v3)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE, + new DirectSlice(k4), DirectSlice.NONE) + }; + + try (final WBWIRocksIterator it = wbwi.newIterator()) { + //direct access - seek to key offsets + final int[] testOffsets = {2, 0, 1, 3}; + + for (int i = 0; i < testOffsets.length; i++) { + final int testOffset = testOffsets[i]; + final byte[] key = 
toArray(expected[testOffset].getKey().data()); + + it.seek(key); + assertThat(it.isValid()).isTrue(); + + final WBWIRocksIterator.WriteEntry entry = it.entry(); + assertThat(entry.equals(expected[testOffset])).isTrue(); + } - //forward iterative access - int i = 0; - for(it.seekToFirst(); it.isValid(); it.next()) { - assertThat(it.entry().equals(expected[i++])).isTrue(); - } + //forward iterative access + int i = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + assertThat(it.entry().equals(expected[i++])).isTrue(); + } - //reverse iterative access - i = expected.length - 1; - for(it.seekToLast(); it.isValid(); it.prev()) { - assertThat(it.entry().equals(expected[i--])).isTrue(); + //reverse iterative access + i = expected.length - 1; + for (it.seekToLast(); it.isValid(); it.prev()) { + assertThat(it.entry().equals(expected[i--])).isTrue(); + } } + } + } - } finally { - if(it != null) { - it.dispose(); + @Test + public void zeroByteTests() { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true)) { + final byte[] zeroByteValue = new byte[]{0, 0}; + //add zero byte value + wbwi.put(zeroByteValue, zeroByteValue); + + final ByteBuffer buffer = ByteBuffer.allocateDirect(zeroByteValue.length); + buffer.put(zeroByteValue); + + WBWIRocksIterator.WriteEntry[] expected = { + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(buffer, zeroByteValue.length), + new DirectSlice(buffer, zeroByteValue.length)) + }; + + try (final WBWIRocksIterator it = wbwi.newIterator()) { + it.seekToFirst(); + assertThat(it.entry().equals(expected[0])).isTrue(); + assertThat(it.entry().hashCode() == expected[0].hashCode()).isTrue(); } } } @Test - public void zeroByteTests() { - final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); - byte[] zeroByteValue = new byte[] { 0, 0 }; - - //add zero byte value - wbwi.put(zeroByteValue, zeroByteValue); - - ByteBuffer buffer = ByteBuffer.allocateDirect(zeroByteValue.length); - 
buffer.put(zeroByteValue); - - WBWIRocksIterator.WriteEntry[] expected = { - new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, - new DirectSlice(buffer, zeroByteValue.length), - new DirectSlice(buffer, zeroByteValue.length)) - }; - WBWIRocksIterator it = null; - try { - it = wbwi.newIterator(); - it.seekToFirst(); - assertThat(it.entry().equals(expected[0])).isTrue(); - assertThat(it.entry().hashCode() == expected[0].hashCode()).isTrue(); - } finally { - if(it != null) { - it.dispose(); + public void savePoints() + throws UnsupportedEncodingException, RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final ReadOptions readOptions = new ReadOptions()) { + wbwi.put("k1".getBytes(), "v1".getBytes()); + wbwi.put("k2".getBytes(), "v2".getBytes()); + wbwi.put("k3".getBytes(), "v3".getBytes()); + + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k1")) + .isEqualTo("v1"); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2")) + .isEqualTo("v2"); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3")) + .isEqualTo("v3"); + + + wbwi.setSavePoint(); + + wbwi.remove("k2".getBytes()); + wbwi.put("k3".getBytes(), "v3-2".getBytes()); + + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2")) + .isNull(); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3")) + .isEqualTo("v3-2"); + + + wbwi.setSavePoint(); + + wbwi.put("k3".getBytes(), "v3-3".getBytes()); + wbwi.put("k4".getBytes(), "v4".getBytes()); + + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3")) + .isEqualTo("v3-3"); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k4")) + .isEqualTo("v4"); + + + wbwi.rollbackToSavePoint(); + + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2")) + .isNull(); + 
assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3")) + .isEqualTo("v3-2"); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k4")) + .isNull(); + + + wbwi.rollbackToSavePoint(); + + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k1")) + .isEqualTo("v1"); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2")) + .isEqualTo("v2"); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3")) + .isEqualTo("v3"); + assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k4")) + .isNull(); } } } + @Test(expected = RocksDBException.class) + public void restorePoints_withoutSavePoints() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + wbwi.rollbackToSavePoint(); + } + } + + @Test(expected = RocksDBException.class) + public void restorePoints_withoutSavePoints_nested() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + wbwi.setSavePoint(); + wbwi.rollbackToSavePoint(); + + // without previous corresponding setSavePoint + wbwi.rollbackToSavePoint(); + } + } + + private static String getFromWriteBatchWithIndex(final RocksDB db, + final ReadOptions readOptions, final WriteBatchWithIndex wbwi, + final String skey) { + final byte[] key = skey.getBytes(); + try(final RocksIterator baseIterator = db.newIterator(readOptions); + final RocksIterator iterator = wbwi.newIteratorWithBase(baseIterator)) { + iterator.seek(key); + + // Arrays.equals(key, iterator.key()) ensures an exact match in Rocks, + // instead of a nearest match + return iterator.isValid() && + Arrays.equals(key, iterator.key()) ? 
+ new String(iterator.value()) : null; + } + } + private byte[] toArray(final ByteBuffer buf) { final byte[] ary = new byte[buf.remaining()]; buf.get(ary); diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java index 4d8e6d97e1..c6af5c8185 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -17,15 +17,16 @@ public class WriteOptionsTest { new RocksMemoryResource(); @Test - public void writeOptions(){ - WriteOptions writeOptions = new WriteOptions(); - writeOptions.setDisableWAL(true); - assertThat(writeOptions.disableWAL()).isTrue(); - writeOptions.setDisableWAL(false); - assertThat(writeOptions.disableWAL()).isFalse(); - writeOptions.setSync(true); - assertThat(writeOptions.sync()).isTrue(); - writeOptions.setSync(false); - assertThat(writeOptions.sync()).isFalse(); + public void writeOptions() { + try (final WriteOptions writeOptions = new WriteOptions()) { + writeOptions.setDisableWAL(true); + assertThat(writeOptions.disableWAL()).isTrue(); + writeOptions.setDisableWAL(false); + assertThat(writeOptions.disableWAL()).isFalse(); + writeOptions.setSync(true); + assertThat(writeOptions.sync()).isTrue(); + writeOptions.setSync(false); + assertThat(writeOptions.sync()).isFalse(); + } } } diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java b/external/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java index c800574f5d..044f96b941 100644 --- 
a/external/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java new file mode 100644 index 0000000000..01ea52c49c --- /dev/null +++ b/external/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java @@ -0,0 +1,480 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb.util; + +import org.junit.Test; +import org.rocksdb.*; +import org.rocksdb.Comparator; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.*; + +import static org.junit.Assert.*; + +/** + * This is a direct port of various C++ + * tests from db/comparator_db_test.cc + * and some code to adapt it to RocksJava + */ +public class BytewiseComparatorTest { + + /** + * Open the database using the C++ BytewiseComparatorImpl + * and test the results against our Java BytewiseComparator + */ + @Test + public void java_vs_cpp_bytewiseComparator() + throws IOException, RocksDBException { + for(int rand_seed = 301; rand_seed < 306; rand_seed++) { + final Path dbDir = Files.createTempDirectory("comparator_db_test"); + try(final RocksDB db = openDatabase(dbDir, + BuiltinComparator.BYTEWISE_COMPARATOR)) { + final Random rnd = new Random(rand_seed); + doRandomIterationTest( + db, + toJavaComparator(new BytewiseComparator(new ComparatorOptions())), + Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), + rnd, + 8, 100, 3 + ); + } finally { + removeData(dbDir); + } + } + } + + /** + * Open the database using the Java BytewiseComparator + * and test the results against another Java BytewiseComparator + */ + @Test + public void java_vs_java_bytewiseComparator() + throws IOException, RocksDBException { + for(int rand_seed = 301; rand_seed < 306; rand_seed++) { + final Path dbDir = Files.createTempDirectory("comparator_db_test"); + try(final RocksDB db = openDatabase(dbDir, new BytewiseComparator( + new ComparatorOptions()))) { + final Random rnd = new Random(rand_seed); + doRandomIterationTest( + db, + toJavaComparator(new BytewiseComparator(new ComparatorOptions())), + Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), + 
rnd, + 8, 100, 3 + ); + } finally { + removeData(dbDir); + } + } + } + + /** + * Open the database using the C++ BytewiseComparatorImpl + * and test the results against our Java DirectBytewiseComparator + */ + @Test + public void java_vs_cpp_directBytewiseComparator() + throws IOException, RocksDBException { + for(int rand_seed = 301; rand_seed < 306; rand_seed++) { + final Path dbDir = Files.createTempDirectory("comparator_db_test"); + try(final RocksDB db = openDatabase(dbDir, + BuiltinComparator.BYTEWISE_COMPARATOR)) { + final Random rnd = new Random(rand_seed); + doRandomIterationTest( + db, + toJavaComparator(new DirectBytewiseComparator( + new ComparatorOptions()) + ), + Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), + rnd, + 8, 100, 3 + ); + } finally { + removeData(dbDir); + } + } + } + + /** + * Open the database using the Java DirectBytewiseComparator + * and test the results against another Java DirectBytewiseComparator + */ + @Test + public void java_vs_java_directBytewiseComparator() + throws IOException, RocksDBException { + for(int rand_seed = 301; rand_seed < 306; rand_seed++) { + final Path dbDir = Files.createTempDirectory("comparator_db_test"); + try(final RocksDB db = openDatabase(dbDir, new DirectBytewiseComparator( + new ComparatorOptions()))) { + final Random rnd = new Random(rand_seed); + doRandomIterationTest( + db, + toJavaComparator(new DirectBytewiseComparator( + new ComparatorOptions()) + ), + Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), + rnd, + 8, 100, 3 + ); + } finally { + removeData(dbDir); + } + } + } + + /** + * Open the database using the C++ ReverseBytewiseComparatorImpl + * and test the results against our Java ReverseBytewiseComparator + */ + @Test + public void java_vs_cpp_reverseBytewiseComparator() + throws IOException, RocksDBException { + for(int rand_seed = 301; rand_seed < 306; rand_seed++) { + final Path dbDir = Files.createTempDirectory("comparator_db_test"); + try(final RocksDB db = 
openDatabase(dbDir, + BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR)) { + final Random rnd = new Random(rand_seed); + doRandomIterationTest( + db, + toJavaComparator( + new ReverseBytewiseComparator(new ComparatorOptions()) + ), + Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), + rnd, + 8, 100, 3 + ); + } finally { + removeData(dbDir); + } + } + } + + /** + * Open the database using the Java ReverseBytewiseComparator + * and test the results against another Java ReverseBytewiseComparator + */ + @Test + public void java_vs_java_reverseBytewiseComparator() + throws IOException, RocksDBException { + + for(int rand_seed = 301; rand_seed < 306; rand_seed++) { + final Path dbDir = Files.createTempDirectory("comparator_db_test"); + try(final RocksDB db = openDatabase(dbDir, new ReverseBytewiseComparator( + new ComparatorOptions()))) { + final Random rnd = new Random(rand_seed); + doRandomIterationTest( + db, + toJavaComparator( + new ReverseBytewiseComparator(new ComparatorOptions()) + ), + Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), + rnd, + 8, 100, 3 + ); + } finally { + removeData(dbDir); + } + } + } + + private void doRandomIterationTest( + final RocksDB db, final java.util.Comparator javaComparator, + final List source_strings, final Random rnd, + final int num_writes, final int num_iter_ops, + final int num_trigger_flush) throws RocksDBException { + + final TreeMap map = new TreeMap<>(javaComparator); + + for (int i = 0; i < num_writes; i++) { + if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) { + db.flush(new FlushOptions()); + } + + final int type = rnd.nextInt(2); + final int index = rnd.nextInt(source_strings.size()); + final String key = source_strings.get(index); + switch (type) { + case 0: + // put + map.put(key, key); + db.put(new WriteOptions(), bytes(key), bytes(key)); + break; + case 1: + // delete + if (map.containsKey(key)) { + map.remove(key); + } + db.remove(new WriteOptions(), bytes(key)); + break; + + 
default: + fail("Should not be able to generate random outside range 1..2"); + } + } + + try(final RocksIterator iter = db.newIterator(new ReadOptions())) { + final KVIter result_iter = new KVIter(map); + + boolean is_valid = false; + for (int i = 0; i < num_iter_ops; i++) { + // Random walk and make sure iter and result_iter returns the + // same key and value + final int type = rnd.nextInt(6); + iter.status(); + switch (type) { + case 0: + // Seek to First + iter.seekToFirst(); + result_iter.seekToFirst(); + break; + case 1: + // Seek to last + iter.seekToLast(); + result_iter.seekToLast(); + break; + case 2: { + // Seek to random key + final int key_idx = rnd.nextInt(source_strings.size()); + final String key = source_strings.get(key_idx); + iter.seek(bytes(key)); + result_iter.seek(bytes(key)); + break; + } + case 3: + // Next + if (is_valid) { + iter.next(); + result_iter.next(); + } else { + continue; + } + break; + case 4: + // Prev + if (is_valid) { + iter.prev(); + result_iter.prev(); + } else { + continue; + } + break; + default: { + assert (type == 5); + final int key_idx = rnd.nextInt(source_strings.size()); + final String key = source_strings.get(key_idx); + final byte[] result = db.get(new ReadOptions(), bytes(key)); + if (!map.containsKey(key)) { + assertNull(result); + } else { + assertArrayEquals(bytes(map.get(key)), result); + } + break; + } + } + + assertEquals(result_iter.isValid(), iter.isValid()); + + is_valid = iter.isValid(); + + if (is_valid) { + assertArrayEquals(bytes(result_iter.key()), iter.key()); + + //note that calling value on a non-valid iterator from the Java API + //results in a SIGSEGV + assertArrayEquals(bytes(result_iter.value()), iter.value()); + } + } + } + } + + /** + * Open the database using a C++ Comparator + */ + private RocksDB openDatabase( + final Path dbDir, final BuiltinComparator cppComparator) + throws IOException, RocksDBException { + final Options options = new Options() + .setCreateIfMissing(true) + 
.setComparator(cppComparator); + return RocksDB.open(options, dbDir.toAbsolutePath().toString()); + } + + /** + * Open the database using a Java Comparator + */ + private RocksDB openDatabase( + final Path dbDir, + final AbstractComparator> javaComparator) + throws IOException, RocksDBException { + final Options options = new Options() + .setCreateIfMissing(true) + .setComparator(javaComparator); + return RocksDB.open(options, dbDir.toAbsolutePath().toString()); + } + + private void closeDatabase(final RocksDB db) { + db.close(); + } + + private void removeData(final Path dbDir) throws IOException { + Files.walkFileTree(dbDir, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile( + final Path file, final BasicFileAttributes attrs) + throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory( + final Path dir, final IOException exc) throws IOException { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } + }); + } + + private byte[] bytes(final String s) { + return s.getBytes(StandardCharsets.UTF_8); + } + + private java.util.Comparator toJavaComparator( + final Comparator rocksComparator) { + return new java.util.Comparator() { + @Override + public int compare(final String s1, final String s2) { + return rocksComparator.compare(new Slice(s1), new Slice(s2)); + } + }; + } + + private java.util.Comparator toJavaComparator( + final DirectComparator rocksComparator) { + return new java.util.Comparator() { + @Override + public int compare(final String s1, final String s2) { + return rocksComparator.compare(new DirectSlice(s1), + new DirectSlice(s2)); + } + }; + } + + private class KVIter implements RocksIteratorInterface { + + private final List> entries; + private final java.util.Comparator comparator; + private int offset = -1; + + private int lastPrefixMatchIdx = -1; + private int lastPrefixMatch = 0; + + public KVIter(final TreeMap map) { + this.entries = new 
ArrayList<>(); + final Iterator> iterator = map.entrySet().iterator(); + while(iterator.hasNext()) { + entries.add(iterator.next()); + } + this.comparator = map.comparator(); + } + + + @Override + public boolean isValid() { + return offset > -1 && offset < entries.size(); + } + + @Override + public void seekToFirst() { + offset = 0; + } + + @Override + public void seekToLast() { + offset = entries.size() - 1; + } + + @Override + public void seek(final byte[] target) { + for(offset = 0; offset < entries.size(); offset++) { + if(comparator.compare(entries.get(offset).getKey(), + (K)new String(target, StandardCharsets.UTF_8)) >= 0) { + return; + } + } + } + + /** + * Is `a` a prefix of `b` + * + * @return The length of the matching prefix, or 0 if it is not a prefix + */ + private int isPrefix(final byte[] a, final byte[] b) { + if(b.length >= a.length) { + for(int i = 0; i < a.length; i++) { + if(a[i] != b[i]) { + return i; + } + } + return a.length; + } else { + return 0; + } + } + + @Override + public void next() { + if(offset < entries.size()) { + offset++; + } + } + + @Override + public void prev() { + if(offset >= 0) { + offset--; + } + } + + @Override + public void status() throws RocksDBException { + if(offset < 0 || offset >= entries.size()) { + throw new RocksDBException("Index out of bounds. 
Size is: " + + entries.size() + ", offset is: " + offset); + } + } + + public K key() { + if(!isValid()) { + if(entries.isEmpty()) { + return (K)""; + } else if(offset == -1){ + return entries.get(0).getKey(); + } else if(offset == entries.size()) { + return entries.get(offset - 1).getKey(); + } else { + return (K)""; + } + } else { + return entries.get(offset).getKey(); + } + } + + public V value() { + if(!isValid()) { + return (V)""; + } else { + return entries.get(offset).getValue(); + } + } + } +} diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index c7160deb64..2de1c45f74 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -117,16 +117,22 @@ public void detectWindows(){ assertThat(Environment.isWindows()).isTrue(); } - @Test(expected = UnsupportedOperationException.class) - public void failWinJniLibraryName(){ + @Test + public void win64() { setEnvironmentClassFields("win", "x64"); - Environment.getJniLibraryFileName("rocksdb"); + assertThat(Environment.isWindows()).isTrue(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".dll"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-win64.dll"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
+ isEqualTo("librocksdbjni.dll"); } @Test(expected = UnsupportedOperationException.class) - public void failWinSharedLibrary(){ - setEnvironmentClassFields("win", "x64"); - Environment.getSharedLibraryFileName("rocksdb"); + public void win32(){ + setEnvironmentClassFields("win", "32"); + Environment.getJniLibraryFileName("rocksdb"); } private void setEnvironmentClassFields(String osName, diff --git a/external/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java b/external/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java index 517e1b2b5a..e74c041030 100644 --- a/external/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java +++ b/external/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/util/hash_cuckoo_rep.cc b/external/rocksdb/memtable/hash_cuckoo_rep.cc similarity index 98% rename from external/rocksdb/util/hash_cuckoo_rep.cc rename to external/rocksdb/memtable/hash_cuckoo_rep.cc index 6e5057a739..6ae3e098bf 100644 --- a/external/rocksdb/util/hash_cuckoo_rep.cc +++ b/external/rocksdb/memtable/hash_cuckoo_rep.cc @@ -1,12 +1,11 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// #ifndef ROCKSDB_LITE - -#include "util/hash_cuckoo_rep.h" +#include "memtable/hash_cuckoo_rep.h" #include #include @@ -18,9 +17,10 @@ #include "db/memtable.h" #include "db/skiplist.h" +#include "memtable/stl_wrappers.h" +#include "port/port.h" #include "rocksdb/memtablerep.h" #include "util/murmurhash.h" -#include "util/stl_wrappers.h" namespace rocksdb { namespace { @@ -318,7 +318,7 @@ void HashCuckooRep::Get(const LookupKey& key, void* callback_args, } void HashCuckooRep::Insert(KeyHandle handle) { - static const float kMaxFullness = 0.90; + static const float kMaxFullness = 0.90f; auto* key = static_cast(handle); int initial_hash_id = 0; @@ -620,12 +620,13 @@ MemTableRep* HashCuckooRepFactory::CreateMemTableRep( // degrades as the fullness of the mem-table increases. Setting kFullness // to a value around 0.7 can better avoid write performance degradation while // keeping efficient memory usage. - static const float kFullness = 0.7; + static const float kFullness = 0.7f; size_t pointer_size = sizeof(std::atomic); assert(write_buffer_size_ >= (average_data_size_ + pointer_size)); size_t bucket_count = + static_cast( (write_buffer_size_ / (average_data_size_ + pointer_size)) / kFullness + - 1; + 1); unsigned int hash_function_count = hash_function_count_; if (hash_function_count < 2) { hash_function_count = 2; @@ -635,7 +636,9 @@ MemTableRep* HashCuckooRepFactory::CreateMemTableRep( } return new HashCuckooRep(compare, allocator, bucket_count, hash_function_count, - (average_data_size_ + pointer_size) / kFullness); + static_cast( + (average_data_size_ + pointer_size) / kFullness) + ); } MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size, diff --git a/external/rocksdb/util/hash_cuckoo_rep.h b/external/rocksdb/memtable/hash_cuckoo_rep.h similarity index 94% rename from external/rocksdb/util/hash_cuckoo_rep.h rename to external/rocksdb/memtable/hash_cuckoo_rep.h index 9f374a978d..173a907b4e 100644 --- a/external/rocksdb/util/hash_cuckoo_rep.h +++ 
b/external/rocksdb/memtable/hash_cuckoo_rep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,8 +6,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #pragma once +#ifndef ROCKSDB_LITE +#include "port/port.h" #include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" diff --git a/external/rocksdb/util/hash_linklist_rep.cc b/external/rocksdb/memtable/hash_linklist_rep.cc similarity index 99% rename from external/rocksdb/util/hash_linklist_rep.cc rename to external/rocksdb/memtable/hash_linklist_rep.cc index 1e6eadfe6c..902c30e8ac 100644 --- a/external/rocksdb/util/hash_linklist_rep.cc +++ b/external/rocksdb/memtable/hash_linklist_rep.cc @@ -1,11 +1,11 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// #ifndef ROCKSDB_LITE -#include "util/hash_linklist_rep.h" +#include "memtable/hash_linklist_rep.h" #include #include diff --git a/external/rocksdb/util/hash_linklist_rep.h b/external/rocksdb/memtable/hash_linklist_rep.h similarity index 96% rename from external/rocksdb/util/hash_linklist_rep.h rename to external/rocksdb/memtable/hash_linklist_rep.h index 629272394d..5197e7cfbb 100644 --- a/external/rocksdb/util/hash_linklist_rep.h +++ b/external/rocksdb/memtable/hash_linklist_rep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,8 +6,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #pragma once +#ifndef ROCKSDB_LITE #include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" diff --git a/external/rocksdb/util/hash_skiplist_rep.cc b/external/rocksdb/memtable/hash_skiplist_rep.cc similarity index 99% rename from external/rocksdb/util/hash_skiplist_rep.cc rename to external/rocksdb/memtable/hash_skiplist_rep.cc index 142903d427..73a917607b 100644 --- a/external/rocksdb/util/hash_skiplist_rep.cc +++ b/external/rocksdb/memtable/hash_skiplist_rep.cc @@ -1,11 +1,11 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// #ifndef ROCKSDB_LITE -#include "util/hash_skiplist_rep.h" +#include "memtable/hash_skiplist_rep.h" #include diff --git a/external/rocksdb/util/hash_skiplist_rep.h b/external/rocksdb/memtable/hash_skiplist_rep.h similarity index 95% rename from external/rocksdb/util/hash_skiplist_rep.h rename to external/rocksdb/memtable/hash_skiplist_rep.h index 15d0fc77fd..56a289c4b1 100644 --- a/external/rocksdb/util/hash_skiplist_rep.h +++ b/external/rocksdb/memtable/hash_skiplist_rep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,8 +6,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #pragma once +#ifndef ROCKSDB_LITE #include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" diff --git a/external/rocksdb/util/skiplistrep.cc b/external/rocksdb/memtable/skiplistrep.cc similarity index 90% rename from external/rocksdb/util/skiplistrep.cc rename to external/rocksdb/memtable/skiplistrep.cc index 112a7ab122..b8c90c6d6d 100644 --- a/external/rocksdb/util/skiplistrep.cc +++ b/external/rocksdb/memtable/skiplistrep.cc @@ -1,17 +1,17 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// -#include "rocksdb/memtablerep.h" +#include "db/inlineskiplist.h" #include "db/memtable.h" -#include "db/skiplist.h" +#include "rocksdb/memtablerep.h" #include "util/arena.h" namespace rocksdb { namespace { class SkipListRep : public MemTableRep { - SkipList skip_list_; + InlineSkipList skip_list_; const MemTableRep::KeyComparator& cmp_; const SliceTransform* transform_; const size_t lookahead_; @@ -25,12 +25,21 @@ class SkipListRep : public MemTableRep { transform_(transform), lookahead_(lookahead) { } + virtual KeyHandle Allocate(const size_t len, char** buf) override { + *buf = skip_list_.AllocateKey(len); + return static_cast(*buf); + } + // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. virtual void Insert(KeyHandle handle) override { skip_list_.Insert(static_cast(handle)); } + virtual void InsertConcurrently(KeyHandle handle) override { + skip_list_.InsertConcurrently(static_cast(handle)); + } + // Returns true iff an entry that compares equal to key is in the list. virtual bool Contains(const char* key) const override { return skip_list_.Contains(key); @@ -65,13 +74,14 @@ class SkipListRep : public MemTableRep { // Iteration over the contents of a skip list class Iterator : public MemTableRep::Iterator { - SkipList::Iterator iter_; + InlineSkipList::Iterator iter_; + public: // Initialize an iterator over the specified list. // The returned iterator is not valid. 
explicit Iterator( - const SkipList* list - ) : iter_(list) { } + const InlineSkipList* list) + : iter_(list) {} virtual ~Iterator() override { } @@ -213,8 +223,8 @@ class SkipListRep : public MemTableRep { private: const SkipListRep& rep_; - SkipList::Iterator iter_; - SkipList::Iterator prev_; + InlineSkipList::Iterator iter_; + InlineSkipList::Iterator prev_; }; virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { diff --git a/external/rocksdb/util/stl_wrappers.h b/external/rocksdb/memtable/stl_wrappers.h similarity index 68% rename from external/rocksdb/util/stl_wrappers.h rename to external/rocksdb/memtable/stl_wrappers.h index 15b9bdf529..a431330171 100644 --- a/external/rocksdb/util/stl_wrappers.h +++ b/external/rocksdb/memtable/stl_wrappers.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -30,17 +30,5 @@ struct Compare : private Base { } }; -struct LessOfComparator { - explicit LessOfComparator(const Comparator* c = BytewiseComparator()) - : cmp(c) {} - - bool operator()(const std::string& a, const std::string& b) const { - return cmp->Compare(Slice(a), Slice(b)) < 0; - } - - const Comparator* cmp; -}; - -typedef std::map KVMap; } } diff --git a/external/rocksdb/util/vectorrep.cc b/external/rocksdb/memtable/vectorrep.cc similarity index 98% rename from external/rocksdb/util/vectorrep.cc rename to external/rocksdb/memtable/vectorrep.cc index 017f89f7c7..b9d9ebe0a9 100644 --- a/external/rocksdb/util/vectorrep.cc +++ b/external/rocksdb/memtable/vectorrep.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,9 +14,9 @@ #include "util/arena.h" #include "db/memtable.h" +#include "memtable/stl_wrappers.h" #include "port/port.h" #include "util/mutexlock.h" -#include "util/stl_wrappers.h" namespace rocksdb { namespace { diff --git a/external/rocksdb/port/dirent.h b/external/rocksdb/port/dirent.h index ee4ded1433..f927db7e24 100644 --- a/external/rocksdb/port/dirent.h +++ b/external/rocksdb/port/dirent.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/port/likely.h b/external/rocksdb/port/likely.h index ede0df5a15..d6e6295cc0 100644 --- a/external/rocksdb/port/likely.h +++ b/external/rocksdb/port/likely.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/port/port.h b/external/rocksdb/port/port.h index e949cd286b..5f45dbb42c 100644 --- a/external/rocksdb/port/port.h +++ b/external/rocksdb/port/port.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,4 +19,3 @@ #elif defined(OS_WIN) #include "port/win/port_win.h" #endif - diff --git a/external/rocksdb/port/port_example.h b/external/rocksdb/port/port_example.h index ba14618fa4..e4bcb329b1 100644 --- a/external/rocksdb/port/port_example.h +++ b/external/rocksdb/port/port_example.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/port/port_posix.cc b/external/rocksdb/port/port_posix.cc index 773c6f1c37..1ad81ad885 100644 --- a/external/rocksdb/port/port_posix.cc +++ b/external/rocksdb/port/port_posix.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -10,6 +10,9 @@ #include "port/port_posix.h" #include +#if defined(__i386__) || defined(__x86_64__) +#include +#endif #include #include #include @@ -32,7 +35,7 @@ static int PthreadCall(const char* label, int result) { } Mutex::Mutex(bool adaptive) { -#ifdef OS_LINUX +#ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX if (!adaptive) { PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); } else { @@ -45,9 +48,9 @@ Mutex::Mutex(bool adaptive) { PthreadCall("destroy mutex attr", pthread_mutexattr_destroy(&mutex_attr)); } -#else // ignore adaptive for non-linux platform +#else PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); -#endif // OS_LINUX +#endif // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX } Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } @@ -132,6 +135,19 @@ void RWMutex::ReadUnlock() { PthreadCall("read unlock", pthread_rwlock_unlock(&m void RWMutex::WriteUnlock() { PthreadCall("write unlock", pthread_rwlock_unlock(&mu_)); } +int PhysicalCoreID() { +#if defined(__i386__) || defined(__x86_64__) + // if you ever find that this function is hot on Linux, you can go from + // ~200 nanos to ~20 nanos by adding the machinery to use __vdso_getcpu + unsigned eax, ebx = 0, ecx, edx; + __get_cpuid(1, &eax, &ebx, &ecx, &edx); + return ebx >> 24; +#else + // getcpu or sched_getcpu could work here + return -1; +#endif +} + void InitOnce(OnceType* once, void (*initializer)()) { PthreadCall("once", pthread_once(once, initializer)); } diff --git a/external/rocksdb/port/port_posix.h b/external/rocksdb/port/port_posix.h index efb72ee106..15c4d0c0ae 100644 --- a/external/rocksdb/port/port_posix.h +++ b/external/rocksdb/port/port_posix.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,6 +16,8 @@ // in fact, we could use that one #define ROCKSDB_PRIszt "zu" +#define __declspec(S) + #define ROCKSDB_NOEXCEPT noexcept #undef PLATFORM_IS_LITTLE_ENDIAN @@ -32,22 +34,20 @@ #else #define PLATFORM_IS_LITTLE_ENDIAN false #endif -#elif defined(OS_FREEBSD) +#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || \ + defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) #include #include #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) -#elif defined(OS_OPENBSD) || defined(OS_NETBSD) ||\ - defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) - #include - #include #else #include #endif #include #include -#include #include +#include +#include #ifndef PLATFORM_IS_LITTLE_ENDIAN #define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) @@ -70,18 +70,17 @@ #if defined(OS_ANDROID) && __ANDROID_API__ < 9 // fdatasync() was only introduced in API level 9 on Android. Use fsync() -// when targetting older platforms. +// when targeting older platforms. 
#define fdatasync fsync #endif -#include - namespace rocksdb { namespace port { // For use at db/file_indexer.h kLevelMaxIndex const int kMaxInt32 = std::numeric_limits::max(); const uint64_t kMaxUint64 = std::numeric_limits::max(); +const int64_t kMaxInt64 = std::numeric_limits::max(); const size_t kMaxSizet = std::numeric_limits::max(); static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN; @@ -145,6 +144,20 @@ class CondVar { Mutex* mu_; }; +static inline void AsmVolatilePause() { +#if defined(__i386__) || defined(__x86_64__) + asm volatile("pause"); +#elif defined(__aarch64__) + asm volatile("wfe"); +#elif defined(__powerpc64__) + asm volatile("or 27,27,27"); +#endif + // it's okay for other platforms to be no-ops +} + +// Returns -1 if not available on this platform +extern int PhysicalCoreID(); + typedef pthread_once_t OnceType; #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT extern void InitOnce(OnceType* once, void (*initializer)()); @@ -159,4 +172,3 @@ extern int GetMaxOpenFiles(); } // namespace port } // namespace rocksdb - diff --git a/external/rocksdb/port/stack_trace.cc b/external/rocksdb/port/stack_trace.cc index e2211e9876..bec0c994b8 100644 --- a/external/rocksdb/port/stack_trace.cc +++ b/external/rocksdb/port/stack_trace.cc @@ -1,12 +1,12 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// #include "port/stack_trace.h" -#if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX)) || \ - defined(CYGWIN) +#if defined(ROCKSDB_LITE) || !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || \ + defined(CYGWIN) || defined(OS_FREEBSD) // noop diff --git a/external/rocksdb/port/stack_trace.h b/external/rocksdb/port/stack_trace.h index 8bc6c7d2ec..3108b4d2e1 100644 --- a/external/rocksdb/port/stack_trace.h +++ b/external/rocksdb/port/stack_trace.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/port/sys_time.h b/external/rocksdb/port/sys_time.h index 6c23d8e501..53e646e69a 100644 --- a/external/rocksdb/port/sys_time.h +++ b/external/rocksdb/port/sys_time.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/port/util_logger.h b/external/rocksdb/port/util_logger.h index dbb67173f9..05782b0c58 100644 --- a/external/rocksdb/port/util_logger.h +++ b/external/rocksdb/port/util_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/external/rocksdb/port/win/env_default.cc b/external/rocksdb/port/win/env_default.cc new file mode 100644 index 0000000000..09c25c02b8 --- /dev/null +++ b/external/rocksdb/port/win/env_default.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include +#include "port/win/env_win.h" + +namespace rocksdb { +namespace port { + +// We choose to create this on the heap and using std::once for the following +// reasons +// 1) Currently available MS compiler does not implement atomic C++11 +// initialization of +// function local statics +// 2) We choose not to destroy the env because joining the threads from the +// system loader +// which destroys the statics (same as from DLLMain) creates a system loader +// dead-lock. +// in this manner any remaining threads are terminated OK. +namespace { + std::once_flag winenv_once_flag; + Env* envptr; +}; + +} + +Env* Env::Default() { + using namespace port; + std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); }); + return envptr; +} + +} + diff --git a/external/rocksdb/port/win/env_win.cc b/external/rocksdb/port/win/env_win.cc index 9853ccbb53..2a718185f4 100644 --- a/external/rocksdb/port/win/env_win.cc +++ b/external/rocksdb/port/win/env_win.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,14 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include -#include #include #include #include -#include -#include -#include +#include // _getpid +#include // _access +#include // _rmdir, _mkdir, _getcwd #include #include @@ -25,12 +24,10 @@ #include "port/port.h" #include "port/dirent.h" #include "port/win/win_logger.h" +#include "port/win/io_win.h" +#include "port/win/env_win.h" -#include "util/random.h" #include "util/iostats_context_imp.h" -#include "util/rate_limiter.h" -#include "util/sync_point.h" -#include "util/aligned_buffer.h" #include "util/thread_status_updater.h" #include "util/thread_status_util.h" @@ -40,1520 +37,554 @@ namespace rocksdb { -std::string GetWindowsErrSz(DWORD err) { - LPSTR lpMsgBuf; - FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, - 0, // Default language - reinterpret_cast(&lpMsgBuf), 0, NULL); - - std::string Err = lpMsgBuf; - LocalFree(lpMsgBuf); - return Err; -} - -namespace { - -const size_t c_OneMB = (1 << 20); - ThreadStatusUpdater* CreateThreadStatusUpdater() { return new ThreadStatusUpdater(); } -// A wrapper for fadvise, if the platform doesn't support fadvise, -// it will simply return Status::NotSupport. -int Fadvise(int fd, off_t offset, size_t len, int advice) { - return 0; // simply do nothing. 
-} - -inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { - return Status::IOError(context, GetWindowsErrSz(err)); -} - -inline Status IOErrorFromLastWindowsError(const std::string& context) { - return IOErrorFromWindowsError(context, GetLastError()); -} - -inline Status IOError(const std::string& context, int err_number) { - return Status::IOError(context, strerror(err_number)); -} - -// TODO(sdong): temp logging. Need to help debugging. Remove it when -// the feature is proved to be stable. -inline void PrintThreadInfo(size_t thread_id, size_t terminatingId) { - fprintf(stdout, "Bg thread %Iu terminates %Iu\n", thread_id, terminatingId); -} - -// returns the ID of the current process -inline int current_process_id() { return _getpid(); } +namespace { // RAII helpers for HANDLEs const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; typedef std::unique_ptr UniqueCloseHandlePtr; -// We preserve the original name of this interface to denote the original idea -// behind it. -// All reads happen by a specified offset and pwrite interface does not change -// the position of the file pointer. Judging from the man page and errno it does -// execute -// lseek atomically to return the position of the file back where it was. -// WriteFile() does not -// have this capability. Therefore, for both pread and pwrite the pointer is -// advanced to the next position -// which is fine for writes because they are (should be) sequential. -// Because all the reads/writes happen by the specified offset, the caller in -// theory should not -// rely on the current file offset. 
-SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, - uint64_t offset) { - OVERLAPPED overlapped = {0}; - ULARGE_INTEGER offsetUnion; - offsetUnion.QuadPart = offset; - - overlapped.Offset = offsetUnion.LowPart; - overlapped.OffsetHigh = offsetUnion.HighPart; - - SSIZE_T result = 0; - - unsigned long bytesWritten = 0; - - if (FALSE == WriteFile(hFile, src, numBytes, &bytesWritten, &overlapped)) { - result = -1; - } else { - result = bytesWritten; +void WinthreadCall(const char* label, std::error_code result) { + if (0 != result.value()) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); + abort(); } +} - return result; } -// See comments for pwrite above -SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset) { - OVERLAPPED overlapped = {0}; - ULARGE_INTEGER offsetUnion; - offsetUnion.QuadPart = offset; +namespace port { - overlapped.Offset = offsetUnion.LowPart; - overlapped.OffsetHigh = offsetUnion.HighPart; +WinEnvIO::WinEnvIO(Env* hosted_env) + : hosted_env_(hosted_env), + page_size_(4 * 1012), + allocation_granularity_(page_size_), + perf_counter_frequency_(0), + GetSystemTimePreciseAsFileTime_(NULL) { - SSIZE_T result = 0; + SYSTEM_INFO sinfo; + GetSystemInfo(&sinfo); - unsigned long bytesRead = 0; + page_size_ = sinfo.dwPageSize; + allocation_granularity_ = sinfo.dwAllocationGranularity; - if (FALSE == ReadFile(hFile, src, numBytes, &bytesRead, &overlapped)) { - return -1; - } else { - result = bytesRead; + { + LARGE_INTEGER qpf; + BOOL ret = QueryPerformanceFrequency(&qpf); + assert(ret == TRUE); + perf_counter_frequency_ = qpf.QuadPart; } - return result; -} - -// Note the below two do not set errno because they are used only here in this -// file -// on a Windows handle and, therefore, not necessary. 
Translating GetLastError() -// to errno -// is a sad business -inline int fsync(HANDLE hFile) { - if (!FlushFileBuffers(hFile)) { - return -1; + HMODULE module = GetModuleHandle("kernel32.dll"); + if (module != NULL) { + GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)GetProcAddress( + module, "GetSystemTimePreciseAsFileTime"); } - - return 0; } -// SetFileInformationByHandle() is capable of fast pre-allocates. -// However, this does not change the file end position unless the file is -// truncated and the pre-allocated space is not considered filled with zeros. -inline Status fallocate(const std::string& filename, HANDLE hFile, - uint64_t to_size) { - Status status; +WinEnvIO::~WinEnvIO() { +} - FILE_ALLOCATION_INFO alloc_info; - alloc_info.AllocationSize.QuadPart = to_size; +Status WinEnvIO::DeleteFile(const std::string& fname) { + Status result; - if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, - sizeof(FILE_ALLOCATION_INFO))) { - auto lastError = GetLastError(); - status = IOErrorFromWindowsError( - "Failed to pre-allocate space: " + filename, lastError); + if (_unlink(fname.c_str())) { + result = IOError("Failed to delete: " + fname, errno); } - return status; + return result; } -inline Status ftruncate(const std::string& filename, HANDLE hFile, - uint64_t toSize) { - Status status; - - FILE_END_OF_FILE_INFO end_of_file; - end_of_file.EndOfFile.QuadPart = toSize; - - if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, - sizeof(FILE_END_OF_FILE_INFO))) { - auto lastError = GetLastError(); - status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, - lastError); +Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { + time_t time = std::time(nullptr); + if (time == (time_t)(-1)) { + return Status::NotSupported("Failed to get time"); } - return status; + *unix_time = time; + return Status::OK(); } -// mmap() based random-access -class WinMmapReadableFile : public RandomAccessFile { - 
const std::string fileName_; - HANDLE hFile_; - HANDLE hMap_; - - const void* mapped_region_; - const size_t length_; - - public: - // mapped_region_[0,length-1] contains the mmapped contents of the file. - WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, - const void* mapped_region, size_t length) - : fileName_(fileName), - hFile_(hFile), - hMap_(hMap), - mapped_region_(mapped_region), - length_(length) {} - - ~WinMmapReadableFile() { - BOOL ret = ::UnmapViewOfFile(mapped_region_); - assert(ret); +Status WinEnvIO::NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status s; - ret = ::CloseHandle(hMap_); - assert(ret); + result->reset(); - ret = ::CloseHandle(hFile_); - assert(ret); - } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - Status s; - - if (offset > length_) { - *result = Slice(); - return IOError(fileName_, EINVAL); - } else if (offset + n > length_) { - n = length_ - offset; - } - *result = - Slice(reinterpret_cast(mapped_region_) + offset, n); - return s; - } - - virtual Status InvalidateCache(size_t offset, size_t length) override { - return Status::OK(); - } -}; - -// We preallocate up to an extra megabyte and use memcpy to append new -// data to the file. This is safe since we either properly close the -// file before reading from it, or for log files, the reading code -// knows enough to skip zero suffixes. -class WinMmapFile : public WritableFile { - private: - const std::string filename_; - HANDLE hFile_; - HANDLE hMap_; - - const size_t page_size_; // We flush the mapping view in page_size - // increments. 
We may decide if this is a memory - // page size or SSD page size - const size_t - allocation_granularity_; // View must start at such a granularity - size_t mapping_size_; // We want file mapping to be of a specific size - // because then the file is expandable - size_t view_size_; // How much memory to map into a view at a time - - char* mapped_begin_; // Must begin at the file offset that is aligned with - // allocation_granularity_ - char* mapped_end_; - char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_]) - char* last_sync_; // Where have we synced up to - - uint64_t file_offset_; // Offset of mapped_begin_ in file - - // Do we have unsynced writes? - bool pending_sync_; - - // Can only truncate or reserve to a sector size aligned if - // used on files that are opened with Unbuffered I/O - Status TruncateFile(uint64_t toSize) { - return ftruncate(filename_, hFile_, toSize); - } - - // Can only truncate or reserve to a sector size aligned if - // used on files that are opened with Unbuffered I/O - // Normally it does not present a problem since in memory mapped files - // we do not disable buffering - Status ReserveFileSpace(uint64_t toSize) { - IOSTATS_TIMER_GUARD(allocate_nanos); - return fallocate(filename_, hFile_, toSize); + // Corruption test needs to rename and delete files of these kind + // while they are still open with another handle. For that reason we + // allow share_write and delete(allows rename). 
+ HANDLE hFile = INVALID_HANDLE_VALUE; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = CreateFileA( + fname.c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, + OPEN_EXISTING, // Original fopen mode is "rb" + FILE_ATTRIBUTE_NORMAL, NULL); } - Status UnmapCurrentRegion() { - Status status; - - if (mapped_begin_ != nullptr) { - if (!::UnmapViewOfFile(mapped_begin_)) { - status = IOErrorFromWindowsError( - "Failed to unmap file view: " + filename_, GetLastError()); - } - - // UnmapView automatically sends data to disk but not the metadata - // which is good and provides some equivalent of fdatasync() on Linux - // therefore, we donot need separate flag for metadata - pending_sync_ = false; - mapped_begin_ = nullptr; - mapped_end_ = nullptr; - dst_ = nullptr; - last_sync_ = nullptr; - - // Move on to the next portion of the file - file_offset_ += view_size_; - - // Increase the amount we map the next time, but capped at 1MB - view_size_ *= 2; - view_size_ = std::min(view_size_, c_OneMB); - } - - return status; + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname, + lastError); + } else { + result->reset(new WinSequentialFile(fname, hFile, options)); } + return s; +} - Status MapNewRegion() { - Status status; - - assert(mapped_begin_ == nullptr); - - size_t minMappingSize = file_offset_ + view_size_; - - // Check if we need to create a new mapping since we want to write beyond - // the current one - // If the mapping view is now too short - // CreateFileMapping will extend the size of the file automatically if the - // mapping size is greater than - // the current length of the file, which reserves the space and makes - // writing faster, except, windows can not map an empty file. 
- // Thus the first time around we must actually extend the file ourselves - if (hMap_ == NULL || minMappingSize > mapping_size_) { - if (NULL == hMap_) { - // Creating mapping for the first time so reserve the space on disk - status = ReserveFileSpace(minMappingSize); - if (!status.ok()) { - return status; - } - } - - if (hMap_) { - // Unmap the previous one - BOOL ret = ::CloseHandle(hMap_); - assert(ret); - hMap_ = NULL; - } - - // Calculate the new mapping size which will hopefully reserve space for - // several consecutive sliding views - // Query preallocation block size if set - size_t preallocationBlockSize = 0; - size_t lastAllocatedBlockSize = 0; // Not used - GetPreallocationStatus(&preallocationBlockSize, &lastAllocatedBlockSize); - - if (preallocationBlockSize) { - preallocationBlockSize = - Roundup(preallocationBlockSize, allocation_granularity_); - } else { - preallocationBlockSize = 2 * view_size_; - } - - mapping_size_ += preallocationBlockSize; - - ULARGE_INTEGER mappingSize; - mappingSize.QuadPart = mapping_size_; - - hMap_ = CreateFileMappingA( - hFile_, - NULL, // Security attributes - PAGE_READWRITE, // There is not a write only mode for mapping - mappingSize.HighPart, // Enable mapping the whole file but the actual - // amount mapped is determined by MapViewOfFile - mappingSize.LowPart, - NULL); // Mapping name - - if (NULL == hMap_) { - return IOErrorFromWindowsError( - "WindowsMmapFile failed to create file mapping for: " + filename_, - GetLastError()); - } - } - - ULARGE_INTEGER offset; - offset.QuadPart = file_offset_; +Status WinEnvIO::NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + result->reset(); + Status s; - // View must begin at the granularity aligned offset - mapped_begin_ = reinterpret_cast( - MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, - view_size_, NULL)); + // Open the file for read-only random access + // Random access is to disable 
read-ahead as the system reads too much data + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; - if (!mapped_begin_) { - status = IOErrorFromWindowsError( - "WindowsMmapFile failed to map file view: " + filename_, - GetLastError()); - } else { - mapped_end_ = mapped_begin_ + view_size_; - dst_ = mapped_begin_; - last_sync_ = mapped_begin_; - pending_sync_ = false; - } - return status; + if (!options.use_os_buffer && !options.use_mmap_reads) { + fileFlags |= FILE_FLAG_NO_BUFFERING; + } else { + fileFlags |= FILE_FLAG_RANDOM_ACCESS; } - public: - WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, - size_t allocation_granularity, const EnvOptions& options) - : filename_(fname), - hFile_(hFile), - hMap_(NULL), - page_size_(page_size), - allocation_granularity_(allocation_granularity), - mapping_size_(0), - view_size_(0), - mapped_begin_(nullptr), - mapped_end_(nullptr), - dst_(nullptr), - last_sync_(nullptr), - file_offset_(0), - pending_sync_(false) { - // Allocation granularity must be obtained from GetSystemInfo() and must be - // a power of two. - assert(allocation_granularity > 0); - assert((allocation_granularity & (allocation_granularity - 1)) == 0); - - assert(page_size > 0); - assert((page_size & (page_size - 1)) == 0); - - // Only for memory mapped writes - assert(options.use_mmap_writes); - - // Make sure buffering is not disabled. It is ignored for mapping - // purposes but also imposes restriction on moving file position - // it is not a problem so much with reserving space since it is probably a - // factor - // of allocation_granularity but we also want to truncate the file in - // Close() at - // arbitrary position so we do not have to feel this with zeros. 
- assert(options.use_os_buffer); - - // View size must be both the multiple of allocation_granularity AND the - // page size - if ((allocation_granularity_ % page_size_) == 0) { - view_size_ = 2 * allocation_granularity; - } else if ((page_size_ % allocation_granularity_) == 0) { - view_size_ = 2 * page_size_; - } else { - // we can multiply them together - assert(false); - } + /// Shared access is necessary for corruption test to pass + // almost all tests would work with a possible exception of fault_injection + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = + CreateFileA(fname.c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); } - ~WinMmapFile() { - if (hFile_) { - this->Close(); - } + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "NewRandomAccessFile failed to Create/Open: " + fname, lastError); } - virtual Status Append(const Slice& data) override { - const char* src = data.data(); - size_t left = data.size(); + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - while (left > 0) { - assert(mapped_begin_ <= dst_); - size_t avail = mapped_end_ - dst_; + // CAUTION! This will map the entire file into the process address space + if (options.use_mmap_reads && sizeof(void*) >= 8) { + // Use mmap when virtual address-space is plentiful. 
+ uint64_t fileSize; - if (avail == 0) { - Status s = UnmapCurrentRegion(); - if (s.ok()) { - s = MapNewRegion(); - } + s = GetFileSize(fname, &fileSize); - if (!s.ok()) { - return s; - } + if (s.ok()) { + // Will not map empty files + if (fileSize == 0) { + return IOError( + "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); } - size_t n = std::min(left, avail); - memcpy(dst_, src, n); - dst_ += n; - src += n; - left -= n; - pending_sync_ = true; - } + HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READONLY, + 0, // Whole file at its present length + 0, + NULL); // Mapping name - return Status::OK(); - } - - // Means Close() will properly take care of truncate - // and it does not need any additional information - virtual Status Truncate(uint64_t size) override { - return Status::OK(); - } - - virtual Status Close() override { - Status s; - - assert(NULL != hFile_); - - // We truncate to the precise size so no - // uninitialized data at the end. SetEndOfFile - // which we use does not write zeros and it is good. 
- uint64_t targetSize = GetFileSize(); - - s = UnmapCurrentRegion(); - - if (NULL != hMap_) { - BOOL ret = ::CloseHandle(hMap_); - if (!ret && s.ok()) { + if (!hMap) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError( - "Failed to Close mapping for file: " + filename_, lastError); - } - - hMap_ = NULL; - } - - TruncateFile(targetSize); - - BOOL ret = ::CloseHandle(hFile_); - hFile_ = NULL; - - if (!ret && s.ok()) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError( - "Failed to close file map handle: " + filename_, lastError); - } - - return s; - } - - virtual Status Flush() override { return Status::OK(); } - - // Flush only data - virtual Status Sync() override { - Status s; - - // Some writes occurred since last sync - if (pending_sync_) { - assert(mapped_begin_); - assert(dst_); - assert(dst_ > mapped_begin_); - assert(dst_ < mapped_end_); - - size_t page_begin = - TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); - size_t page_end = - TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); - last_sync_ = dst_; - - // Flush only the amount of that is a multiple of pages - if (!::FlushViewOfFile(mapped_begin_ + page_begin, - (page_end - page_begin) + page_size_)) { - s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, - GetLastError()); - } - - pending_sync_ = false; - } - - return s; - } - - /** - * Flush data as well as metadata to stable storage. - */ - virtual Status Fsync() override { - Status s; - - // Flush metadata if pending - const bool pending = pending_sync_; - - s = Sync(); - - // Flush metadata - if (s.ok() && pending) { - if (!::FlushFileBuffers(hFile_)) { - s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, - GetLastError()); + return IOErrorFromWindowsError( + "Failed to create file mapping for NewRandomAccessFile: " + fname, + lastError); } - } - return s; - } - - /** - * Get the size of valid data in the file. 
This will not match the - * size that is returned from the filesystem because we use mmap - * to extend file by map_size every time. - */ - virtual uint64_t GetFileSize() override { - size_t used = dst_ - mapped_begin_; - return file_offset_ + used; - } - - virtual Status InvalidateCache(size_t offset, size_t length) override { - return Status::OK(); - } - - virtual Status Allocate(off_t offset, off_t len) override { - return Status::OK(); - } -}; - -class WinSequentialFile : public SequentialFile { - private: - const std::string filename_; - HANDLE file_; - - // There is no equivalent of advising away buffered pages as in posix. - // To implement this flag we would need to do unbuffered reads which - // will need to be aligned (not sure there is a guarantee that the buffer - // passed in is aligned). - // Hence we currently ignore this flag. It is used only in a few cases - // which should not be perf critical. - // If perf evaluation finds this to be a problem, we can look into - // implementing this. - bool use_os_buffer_; - - public: - WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options) - : filename_(fname), - file_(f), - use_os_buffer_(options.use_os_buffer) {} - - virtual ~WinSequentialFile() { - assert(file_ != INVALID_HANDLE_VALUE); - CloseHandle(file_); - } + UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); - virtual Status Read(size_t n, Slice* result, char* scratch) override { - Status s; - size_t r = 0; + const void* mapped_region = + MapViewOfFileEx(hMap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + fileSize, + NULL); // Let the OS choose the mapping - // Windows ReadFile API accepts a DWORD. - // While it is possible to read in a loop if n is > UINT_MAX - // it is a highly unlikely case. 
- if (n > UINT_MAX) { - return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); - } - - DWORD bytesToRead = static_cast(n); //cast is safe due to the check above - DWORD bytesRead = 0; - BOOL ret = ReadFile(file_, scratch, bytesToRead, &bytesRead, NULL); - if (ret == TRUE) { - r = bytesRead; - } else { - return IOErrorFromWindowsError(filename_, GetLastError()); - } - - *result = Slice(scratch, r); - - return s; - } - - virtual Status Skip(uint64_t n) override { - // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit - // integer. As such it is a highly unlikley case to have n so large. - if (n > _I64_MAX) { - return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); - } - - LARGE_INTEGER li; - li.QuadPart = static_cast(n); //cast is safe due to the check above - BOOL ret = SetFilePointerEx(file_, li, NULL, FILE_CURRENT); - if (ret == FALSE) { - return IOErrorFromWindowsError(filename_, GetLastError()); - } - return Status::OK(); - } - - virtual Status InvalidateCache(size_t offset, size_t length) override { - return Status::OK(); - } -}; - -// pread() based random-access -class WinRandomAccessFile : public RandomAccessFile { - const std::string filename_; - HANDLE hFile_; - const bool use_os_buffer_; - mutable std::mutex buffer_mut_; - mutable AlignedBuffer buffer_; - mutable uint64_t - buffered_start_; // file offset set that is currently buffered - - public: - WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options) - : filename_(fname), - hFile_(hFile), - use_os_buffer_(options.use_os_buffer), - buffer_(), - buffered_start_(0) { - assert(!options.use_mmap_reads); - - // Unbuffered access, use internal buffer for reads - if (!use_os_buffer_) { - buffer_.Alignment(alignment); - // Random read, no need in a big buffer - // We read things in database blocks which are likely to be similar to - // the alignment we use. 
- buffer_.AllocateNewBuffer(alignment * 2); - } - } - - virtual ~WinRandomAccessFile() { - if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) { - ::CloseHandle(hFile_); - } - } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - Status s; - SSIZE_T r = -1; - size_t left = n; - char* dest = scratch; - - // When in unbuffered mode we need to do the following changes: - // - use our own aligned buffer - // - always read at the offset of that is a multiple of alignment - if (!use_os_buffer_) { - std::lock_guard lg(buffer_mut_); - - // Let's see if at least some of the requested data is already - // in the buffer - if (offset >= buffered_start_ && - offset < (buffered_start_ + buffer_.CurrentSize())) { - size_t buffer_offset = offset - buffered_start_; - r = buffer_.Read(dest, buffer_offset, left); - assert(r >= 0); - - left -= size_t(r); - offset += r; - dest += r; - } - - // Still some left or none was buffered - if (left > 0) { - // Figure out the start/end offset for reading and amount to read - const size_t alignment = buffer_.Alignment(); - const size_t start_page_start = - TruncateToPageBoundary(alignment, offset); - const size_t end_page_start = - TruncateToPageBoundary(alignment, offset + left - 1); - const size_t actual_bytes_toread = - (end_page_start - start_page_start) + alignment; - - if (buffer_.Capacity() < actual_bytes_toread) { - buffer_.AllocateNewBuffer(actual_bytes_toread); - } else { - buffer_.Clear(); - } - - SSIZE_T read = 0; - read = pread(hFile_, buffer_.Destination(), actual_bytes_toread, - start_page_start); - - if (read > 0) { - buffer_.Size(read); - buffered_start_ = start_page_start; - - // Let's figure out how much we read from the users standpoint - if ((buffered_start_ + uint64_t(read)) > offset) { - size_t buffer_offset = offset - buffered_start_; - r = buffer_.Read(dest, buffer_offset, left); - } else { - r = 0; - } - left -= r; - } else { - r = read; - } - } - - } else { - r = 
pread(hFile_, scratch, left, offset); - if (r > 0) { - left -= r; + if (!mapped_region) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to MapViewOfFile for NewRandomAccessFile: " + fname, + lastError); } - } - - *result = Slice(scratch, (r < 0) ? 0 : n - left); - - if (r < 0) { - s = IOErrorFromLastWindowsError(filename_); - } - return s; - } - - virtual bool ShouldForwardRawRequest() const override { - return true; - } - - virtual void Hint(AccessPattern pattern) override {} - - virtual Status InvalidateCache(size_t offset, size_t length) override { - return Status::OK(); - } -}; - -// This is a sequential write class. It has been mimicked (as others) after -// the original Posix class. We add support for unbuffered I/O on windows as -// well -// we utilize the original buffer as an alignment buffer to write directly to -// file with no buffering. -// No buffering requires that the provided buffer is aligned to the physical -// sector size (SSD page size) and -// that all SetFilePointer() operations to occur with such an alignment. -// We thus always write in sector/page size increments to the drive and leave -// the tail for the next write OR for Close() at which point we pad with zeros. -// No padding is required for -// buffered access. 
-class WinWritableFile : public WritableFile { - private: - const std::string filename_; - HANDLE hFile_; - const bool use_os_buffer_; // Used to indicate unbuffered access, the file - const uint64_t alignment_; - // must be opened as unbuffered if false - uint64_t filesize_; // How much data is actually written disk - uint64_t reservedsize_; // how far we have reserved space - - public: - WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, - size_t capacity, const EnvOptions& options) - : filename_(fname), - hFile_(hFile), - use_os_buffer_(options.use_os_buffer), - alignment_(alignment), - filesize_(0), - reservedsize_(0) { - assert(!options.use_mmap_writes); - } - - ~WinWritableFile() { - if (NULL != hFile_ && INVALID_HANDLE_VALUE != hFile_) { - WinWritableFile::Close(); - } - } - - // Indicates if the class makes use of unbuffered I/O - virtual bool UseOSBuffer() const override { - return use_os_buffer_; - } - - virtual size_t GetRequiredBufferAlignment() const override { - return alignment_; - } - - virtual Status Append(const Slice& data) override { - - // Used for buffered access ONLY - assert(use_os_buffer_); - assert(data.size() < std::numeric_limits::max()); - - Status s; - - DWORD bytesWritten = 0; - if (!WriteFile(hFile_, data.data(), - data.size(), &bytesWritten, NULL)) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError( - "Failed to WriteFile: " + filename_, - lastError); - } else { - assert(size_t(bytesWritten) == data.size()); - filesize_ += data.size(); - } - - return s; - } - - virtual Status PositionedAppend(const Slice& data, uint64_t offset) override { - Status s; - SSIZE_T ret = pwrite(hFile_, data.data(), - data.size(), offset); + result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, + fileSize)); - // Error break - if (ret < 0) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError( - "Failed to pwrite for: " + filename_, lastError); - } else { - // With positional write it 
is not clear at all - // if this actually extends the filesize - assert(size_t(ret) == data.size()); - filesize_ += data.size(); - } - return s; - } - - // Need to implement this so the file is truncated correctly - // when buffered and unbuffered mode - virtual Status Truncate(uint64_t size) override { - Status s = ftruncate(filename_, hFile_, size); - if (s.ok()) { - filesize_ = size; + mapGuard.release(); + fileGuard.release(); } - return s; + } else { + result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options)); + fileGuard.release(); } + return s; +} - virtual Status Close() override { - - Status s; - - assert(INVALID_HANDLE_VALUE != hFile_); - - if (fsync(hFile_) < 0) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_, - lastError); - } +Status WinEnvIO::NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + const size_t c_BufferCapacity = 64 * 1024; - if (FALSE == ::CloseHandle(hFile_)) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_, - lastError); - } + EnvOptions local_options(options); - hFile_ = INVALID_HANDLE_VALUE; - return s; - } + result->reset(); + Status s; - // write out the cached data to the OS cache - // This is now taken care of the WritableFileWriter - virtual Status Flush() override { - return Status::OK(); - } + DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; - virtual Status Sync() override { - Status s; - // Calls flush buffers - if (fsync(hFile_) < 0) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_, - lastError); - } - return s; + if (!local_options.use_os_buffer && !local_options.use_mmap_writes) { + fileFlags = FILE_FLAG_NO_BUFFERING; } - virtual Status Fsync() override { return Sync(); } + // Desired access. 
We are want to write only here but if we want to memory + // map + // the file then there is no write only mode so we have to create it + // Read/Write + // However, MapViewOfFile specifies only Write only + DWORD desired_access = GENERIC_WRITE; + DWORD shared_mode = FILE_SHARE_READ; - virtual uint64_t GetFileSize() override { - // Double accounting now here with WritableFileWriter - // and this size will be wrong when unbuffered access is used - // but tests implement their own writable files and do not use WritableFileWrapper - // so we need to squeeze a square peg through - // a round hole here. - return filesize_; - } - - virtual Status Allocate(off_t offset, off_t len) override { - Status status; - TEST_KILL_RANDOM(rocksdb_kill_odds); - - // Make sure that we reserve an aligned amount of space - // since the reservation block size is driven outside so we want - // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(offset + len, alignment_); - // Nothing to do - if (spaceToReserve <= reservedsize_) { - return status; - } - - IOSTATS_TIMER_GUARD(allocate_nanos); - status = fallocate(filename_, hFile_, spaceToReserve); - if (status.ok()) { - reservedsize_ = spaceToReserve; - } - return status; - } -}; - -class WinDirectory : public Directory { - public: - WinDirectory() {} - - virtual Status Fsync() override { return Status::OK(); } -}; - -class WinFileLock : public FileLock { - public: - explicit WinFileLock(HANDLE hFile) : hFile_(hFile) { - assert(hFile != NULL); - assert(hFile != INVALID_HANDLE_VALUE); + if (local_options.use_mmap_writes) { + desired_access |= GENERIC_READ; + } else { + // Adding this solely for tests to pass (fault_injection_test, + // wal_manager_test). 
+ shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE); } - ~WinFileLock() { - BOOL ret = ::CloseHandle(hFile_); - assert(ret); + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = CreateFileA( + fname.c_str(), + desired_access, // Access desired + shared_mode, + NULL, // Security attributes + CREATE_ALWAYS, // Posix env says O_CREAT | O_RDWR | O_TRUNC + fileFlags, // Flags + NULL); // Template File + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create a NewWriteableFile: " + fname, lastError); } - private: - HANDLE hFile_; -}; - -namespace { - -void WinthreadCall(const char* label, std::error_code result) { - if (0 != result.value()) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); - abort(); + if (options.use_mmap_writes) { + // We usually do not use mmmapping on SSD and thus we pass memory + // page_size + result->reset(new WinMmapFile(fname, hFile, page_size_, + allocation_granularity_, local_options)); + } else { + // Here we want the buffer allocation to be aligned by the SSD page size + // and to be a multiple of it + result->reset(new WinWritableFile(fname, hFile, page_size_, + c_BufferCapacity, local_options)); } + return s; } -} - -class WinEnv : public Env { - public: - WinEnv(); - virtual ~WinEnv() { - for (auto& th : threads_to_join_) { - th.join(); - } - - threads_to_join_.clear(); - - for (auto& thpool : thread_pools_) { - thpool.JoinAllThreads(); - } - // All threads must be joined before the deletion of - // thread_status_updater_. 
- delete thread_status_updater_; - } - - virtual Status DeleteFile(const std::string& fname) override { - Status result; - - if (_unlink(fname.c_str())) { - result = IOError("Failed to delete: " + fname, errno); - } - - return result; - } - - Status GetCurrentTime(int64_t* unix_time) override { - time_t time = std::time(nullptr); - if (time == (time_t)(-1)) { - return Status::NotSupported("Failed to get time"); - } - - *unix_time = time; - return Status::OK(); - } - - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - Status s; - - result->reset(); - - // Corruption test needs to rename and delete files of these kind - // while they are still open with another handle. For that reason we - // allow share_write and delete(allows rename). - HANDLE hFile = INVALID_HANDLE_VALUE; - { - IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, - OPEN_EXISTING, // Original fopen mode is "rb" - FILE_ATTRIBUTE_NORMAL, NULL); - } - - if (INVALID_HANDLE_VALUE == hFile) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname, - lastError); - } else { - result->reset(new WinSequentialFile(fname, hFile, options)); - } - return s; - } - - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - result->reset(); - Status s; - - // Open the file for read-only random access - // Random access is to disable read-ahead as the system reads too much data - DWORD fileFlags = FILE_ATTRIBUTE_READONLY; - - if (!options.use_os_buffer && !options.use_mmap_reads) { - fileFlags |= FILE_FLAG_NO_BUFFERING; - } else { - fileFlags |= FILE_FLAG_RANDOM_ACCESS; - } - - /// Shared access is necessary for corruption test to pass - // almost all tests would work with a possible exception of fault_injection - 
HANDLE hFile = 0; - { - IOSTATS_TIMER_GUARD(open_nanos); - hFile = - CreateFileA(fname.c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, OPEN_EXISTING, fileFlags, NULL); - } - - if (INVALID_HANDLE_VALUE == hFile) { - auto lastError = GetLastError(); - return IOErrorFromWindowsError( - "NewRandomAccessFile failed to Create/Open: " + fname, lastError); - } - - UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - - // CAUTION! This will map the entire file into the process address space - if (options.use_mmap_reads && sizeof(void*) >= 8) { - // Use mmap when virtual address-space is plentiful. - uint64_t fileSize; - - s = GetFileSize(fname, &fileSize); - - if (s.ok()) { - // Will not map empty files - if (fileSize == 0) { - return IOError( - "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); - } - - HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READONLY, - 0, // Whole file at its present length - 0, - NULL); // Mapping name - - if (!hMap) { - auto lastError = GetLastError(); - return IOErrorFromWindowsError( - "Failed to create file mapping for NewRandomAccessFile: " + fname, - lastError); - } - - UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); - - const void* mapped_region = - MapViewOfFileEx(hMap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - fileSize, - NULL); // Let the OS choose the mapping - - if (!mapped_region) { - auto lastError = GetLastError(); - return IOErrorFromWindowsError( - "Failed to MapViewOfFile for NewRandomAccessFile: " + fname, - lastError); - } - - result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, - fileSize)); - - mapGuard.release(); - fileGuard.release(); - } - } else { - result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options)); - fileGuard.release(); - } - return s; +Status WinEnvIO::NewDirectory(const std::string& name, + std::unique_ptr* result) { + Status s; + // Must be nullptr on failure + 
result->reset(); + // Must fail if directory does not exist + if (!DirExists(name)) { + s = IOError("Directory does not exist: " + name, EEXIST); + } else { + IOSTATS_TIMER_GUARD(open_nanos); + result->reset(new WinDirectory); } + return s; +} - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - const size_t c_BufferCapacity = 64 * 1024; - - EnvOptions local_options(options); - - result->reset(); - Status s; - - DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; - - if (!local_options.use_os_buffer && !local_options.use_mmap_writes) { - fileFlags = FILE_FLAG_NO_BUFFERING; - } +Status WinEnvIO::FileExists(const std::string& fname) { + // F_OK == 0 + const int F_OK_ = 0; + return _access(fname.c_str(), F_OK_) == 0 ? Status::OK() + : Status::NotFound(); +} - // Desired access. We are want to write only here but if we want to memory - // map - // the file then there is no write only mode so we have to create it - // Read/Write - // However, MapViewOfFile specifies only Write only - DWORD desired_access = GENERIC_WRITE; - DWORD shared_mode = FILE_SHARE_READ; +Status WinEnvIO::GetChildren(const std::string& dir, + std::vector* result) { + std::vector output; - if (local_options.use_mmap_writes) { - desired_access |= GENERIC_READ; - } else { - // Adding this solely for tests to pass (fault_injection_test, - // wal_manager_test). 
- shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE); - } + Status status; - HANDLE hFile = 0; - { - IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), - desired_access, // Access desired - shared_mode, - NULL, // Security attributes - CREATE_ALWAYS, // Posix env says O_CREAT | O_RDWR | O_TRUNC - fileFlags, // Flags - NULL); // Template File - } + auto CloseDir = [](DIR* p) { closedir(p); }; + std::unique_ptr dirp(opendir(dir.c_str()), + CloseDir); - if (INVALID_HANDLE_VALUE == hFile) { - auto lastError = GetLastError(); - return IOErrorFromWindowsError( - "Failed to create a NewWriteableFile: " + fname, lastError); - } - - if (options.use_mmap_writes) { - // We usually do not use mmmapping on SSD and thus we pass memory - // page_size - result->reset(new WinMmapFile(fname, hFile, page_size_, - allocation_granularity_, local_options)); - } else { - // Here we want the buffer allocation to be aligned by the SSD page size - // and to be a multiple of it - result->reset(new WinWritableFile(fname, hFile, page_size_, - c_BufferCapacity, local_options)); + if (!dirp) { + status = IOError(dir, errno); + } else { + if (result->capacity() > 0) { + output.reserve(result->capacity()); } - return s; - } - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result) override { - Status s; - // Must be nullptr on failure - result->reset(); - // Must fail if directory does not exist - if (!DirExists(name)) { - s = IOError("Directory does not exist: " + name, EEXIST); - } else { - IOSTATS_TIMER_GUARD(open_nanos); - result->reset(new WinDirectory); + struct dirent* ent = readdir(dirp.get()); + while (ent) { + output.push_back(ent->d_name); + ent = readdir(dirp.get()); } - return s; - } - - virtual Status FileExists(const std::string& fname) override { - // F_OK == 0 - const int F_OK_ = 0; - return _access(fname.c_str(), F_OK_) == 0 ? 
Status::OK() - : Status::NotFound(); } - virtual Status GetChildren(const std::string& dir, - std::vector* result) override { - std::vector output; - - Status status; - - auto CloseDir = [](DIR* p) { closedir(p); }; - std::unique_ptr dirp(opendir(dir.c_str()), - CloseDir); + output.swap(*result); - if (!dirp) { - status = IOError(dir, errno); - } else { - if (result->capacity() > 0) { - output.reserve(result->capacity()); - } - - struct dirent* ent = readdir(dirp.get()); - while (ent) { - output.push_back(ent->d_name); - ent = readdir(dirp.get()); - } - } - - output.swap(*result); - - return status; - } - - virtual Status CreateDir(const std::string& name) override { - Status result; + return status; +} - if (_mkdir(name.c_str()) != 0) { - auto code = errno; - result = IOError("Failed to create dir: " + name, code); - } +Status WinEnvIO::CreateDir(const std::string& name) { + Status result; - return result; + if (_mkdir(name.c_str()) != 0) { + auto code = errno; + result = IOError("Failed to create dir: " + name, code); } - virtual Status CreateDirIfMissing(const std::string& name) override { - Status result; - - if (DirExists(name)) { - return result; - } + return result; +} - if (_mkdir(name.c_str()) != 0) { - if (errno == EEXIST) { - result = - Status::IOError("`" + name + "' exists but is not a directory"); - } else { - auto code = errno; - result = IOError("Failed to create dir: " + name, code); - } - } +Status WinEnvIO::CreateDirIfMissing(const std::string& name) { + Status result; + if (DirExists(name)) { return result; } - virtual Status DeleteDir(const std::string& name) override { - Status result; - if (_rmdir(name.c_str()) != 0) { + if (_mkdir(name.c_str()) != 0) { + if (errno == EEXIST) { + result = + Status::IOError("`" + name + "' exists but is not a directory"); + } else { auto code = errno; - result = IOError("Failed to remove dir: " + name, code); + result = IOError("Failed to create dir: " + name, code); } - return result; } - virtual Status 
GetFileSize(const std::string& fname, - uint64_t* size) override { - Status s; + return result; +} - WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { - ULARGE_INTEGER file_size; - file_size.HighPart = attrs.nFileSizeHigh; - file_size.LowPart = attrs.nFileSizeLow; - *size = file_size.QuadPart; - } else { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError); - } - return s; +Status WinEnvIO::DeleteDir(const std::string& name) { + Status result; + if (_rmdir(name.c_str()) != 0) { + auto code = errno; + result = IOError("Failed to remove dir: " + name, code); } + return result; +} - static inline uint64_t FileTimeToUnixTime(const FILETIME& ftTime) { - const uint64_t c_FileTimePerSecond = 10000000U; - // UNIX epoch starts on 1970-01-01T00:00:00Z - // Windows FILETIME starts on 1601-01-01T00:00:00Z - // Therefore, we need to subtract the below number of seconds from - // the seconds that we obtain from FILETIME with an obvious loss of - // precision - const uint64_t c_SecondBeforeUnixEpoch = 11644473600U; - - ULARGE_INTEGER li; - li.HighPart = ftTime.dwHighDateTime; - li.LowPart = ftTime.dwLowDateTime; - - uint64_t result = - (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; - return result; +Status WinEnvIO::GetFileSize(const std::string& fname, + uint64_t* size) { + Status s; + + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { + ULARGE_INTEGER file_size; + file_size.HighPart = attrs.nFileSizeHigh; + file_size.LowPart = attrs.nFileSizeLow; + *size = file_size.QuadPart; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError); } + return s; +} - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { - Status s; +uint64_t WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { + 
const uint64_t c_FileTimePerSecond = 10000000U; + // UNIX epoch starts on 1970-01-01T00:00:00Z + // Windows FILETIME starts on 1601-01-01T00:00:00Z + // Therefore, we need to subtract the below number of seconds from + // the seconds that we obtain from FILETIME with an obvious loss of + // precision + const uint64_t c_SecondBeforeUnixEpoch = 11644473600U; + + ULARGE_INTEGER li; + li.HighPart = ftTime.dwHighDateTime; + li.LowPart = ftTime.dwLowDateTime; + + uint64_t result = + (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; + return result; +} - WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { - *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); - } else { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError( - "Can not get file modification time for: " + fname, lastError); - *file_mtime = 0; - } +Status WinEnvIO::GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + Status s; - return s; + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { + *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Can not get file modification time for: " + fname, lastError); + *file_mtime = 0; } - virtual Status RenameFile(const std::string& src, - const std::string& target) override { - Status result; + return s; +} - // rename() is not capable of replacing the existing file as on Linux - // so use OS API directly - if (!MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) { - DWORD lastError = GetLastError(); +Status WinEnvIO::RenameFile(const std::string& src, + const std::string& target) { + Status result; - std::string text("Failed to rename: "); - text.append(src).append(" to: ").append(target); + // rename() is not capable of replacing the existing file as on Linux + // so use OS API directly + if 
(!MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) { + DWORD lastError = GetLastError(); - result = IOErrorFromWindowsError(text, lastError); - } + std::string text("Failed to rename: "); + text.append(src).append(" to: ").append(target); - return result; + result = IOErrorFromWindowsError(text, lastError); } - virtual Status LinkFile(const std::string& src, - const std::string& target) override { - Status result; + return result; +} - if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) { - DWORD lastError = GetLastError(); +Status WinEnvIO::LinkFile(const std::string& src, + const std::string& target) { + Status result; - std::string text("Failed to link: "); - text.append(src).append(" to: ").append(target); + if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) { + DWORD lastError = GetLastError(); - result = IOErrorFromWindowsError(text, lastError); - } + std::string text("Failed to link: "); + text.append(src).append(" to: ").append(target); - return result; + result = IOErrorFromWindowsError(text, lastError); } - virtual Status LockFile(const std::string& lockFname, - FileLock** lock) override { - assert(lock != nullptr); - - *lock = NULL; - Status result; + return result; +} - // No-sharing, this is a LOCK file - const DWORD ExclusiveAccessON = 0; +Status WinEnvIO::LockFile(const std::string& lockFname, + FileLock** lock) { + assert(lock != nullptr); - // Obtain exclusive access to the LOCK file - // Previously, instead of NORMAL attr we set DELETE on close and that worked - // well except with fault_injection test that insists on deleting it. 
- HANDLE hFile = 0; - { - IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE), - ExclusiveAccessON, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); - } + *lock = NULL; + Status result; - if (INVALID_HANDLE_VALUE == hFile) { - auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create lock file: " + lockFname, lastError); - } else { - *lock = new WinFileLock(hFile); - } + // No-sharing, this is a LOCK file + const DWORD ExclusiveAccessON = 0; - return result; + // Obtain exclusive access to the LOCK file + // Previously, instead of NORMAL attr we set DELETE on close and that worked + // well except with fault_injection test that insists on deleting it. + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE), + ExclusiveAccessON, NULL, CREATE_ALWAYS, + FILE_ATTRIBUTE_NORMAL, NULL); } - virtual Status UnlockFile(FileLock* lock) override { - Status result; - - assert(lock != nullptr); - - delete lock; - - return result; + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError( + "Failed to create lock file: " + lockFname, lastError); + } else { + *lock = new WinFileLock(hFile); } - virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW, - void* tag = nullptr) override; + return result; +} + +Status WinEnvIO::UnlockFile(FileLock* lock) { + Status result; - virtual int UnSchedule(void* arg, Priority pri) override; + assert(lock != nullptr); - virtual void StartThread(void (*function)(void* arg), void* arg) override; + delete lock; - virtual void WaitForJoin() override; + return result; +} - virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; +Status WinEnvIO::GetTestDirectory(std::string* result) { + std::string output; - virtual Status GetTestDirectory(std::string* result) override { - std::string output; + 
const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + output = env; + CreateDir(output); + } else { + env = getenv("TMP"); - const char* env = getenv("TEST_TMPDIR"); if (env && env[0] != '\0') { output = env; - CreateDir(output); } else { - env = getenv("TMP"); - - if (env && env[0] != '\0') { - output = env; - } else { - output = "c:\\tmp"; - } - - CreateDir(output); + output = "c:\\tmp"; } - output.append("\\testrocksdb-"); - output.append(std::to_string(_getpid())); - CreateDir(output); - - output.swap(*result); - - return Status::OK(); } - virtual Status GetThreadList( - std::vector* thread_list) override { - assert(thread_status_updater_); - return thread_status_updater_->GetThreadList(thread_list); - } + output.append("\\testrocksdb-"); + output.append(std::to_string(_getpid())); - static uint64_t gettid() { - uint64_t thread_id = GetCurrentThreadId(); - return thread_id; - } + CreateDir(output); - virtual uint64_t GetThreadID() const override { return gettid(); } + output.swap(*result); - virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result) override { - Status s; + return Status::OK(); +} - result->reset(); +Status WinEnvIO::NewLogger(const std::string& fname, + std::shared_ptr* result) { + Status s; - HANDLE hFile = 0; - { - IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), GENERIC_WRITE, - FILE_SHARE_READ | FILE_SHARE_DELETE, // In RocksDb log files are - // renamed and deleted before - // they are closed. This enables - // doing so. 
- NULL, - CREATE_ALWAYS, // Original fopen mode is "w" - FILE_ATTRIBUTE_NORMAL, NULL); - } + result->reset(); - if (INVALID_HANDLE_VALUE == hFile) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError); - } else { - { - // With log files we want to set the true creation time as of now - // because the system - // for some reason caches the attributes of the previous file that just - // been renamed from - // this name so auto_roll_logger_test fails - FILETIME ft; - GetSystemTimeAsFileTime(&ft); - // Set creation, last access and last write time to the same value - SetFileTime(hFile, &ft, &ft, &ft); - } - result->reset(new WinLogger(&WinEnv::gettid, this, hFile)); - } - return s; - } + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = CreateFileA( + fname.c_str(), GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_DELETE, // In RocksDb log files are + // renamed and deleted before + // they are closed. This enables + // doing so. 
+ NULL, + CREATE_ALWAYS, // Original fopen mode is "w" + FILE_ATTRIBUTE_NORMAL, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError); + } else { + { + // With log files we want to set the true creation time as of now + // because the system + // for some reason caches the attributes of the previous file that just + // been renamed from + // this name so auto_roll_logger_test fails + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + // Set creation, last access and last write time to the same value + SetFileTime(hFile, &ft, &ft, &ft); + } + result->reset(new WinLogger(&WinEnvThreads::gettid, hosted_env_, hFile)); + } + return s; +} #if (_WIN32_WINNT >= _WIN32_WINNT_WIN8) - virtual uint64_t NowMicros() override { +uint64_t WinEnvIO::NowMicros() { + + if (GetSystemTimePreciseAsFileTime_ != NULL) { // all std::chrono clocks on windows proved to return // values that may repeat that is not good enough for some uses. const int64_t c_UnixEpochStartTicks = 116444736000000000i64; @@ -1563,7 +594,7 @@ class WinEnv : public Env { // just any microseconds because it is often used as an argument // to TimedWait() on condition variable FILETIME ftSystemTime; - GetSystemTimePreciseAsFileTime(&ftSystemTime); + GetSystemTimePreciseAsFileTime_(&ftSystemTime); LARGE_INTEGER li; li.LowPart = ftSystemTime.dwLowDateTime; @@ -1574,503 +605,417 @@ class WinEnv : public Env { li.QuadPart /= c_FtToMicroSec; return li.QuadPart; } + using namespace std::chrono; + return duration_cast(system_clock::now().time_since_epoch()).count(); +} #else - virtual uint64_t NowMicros() override { - // On Windows 7 and below, where GetSystemTimePreciseAsFileTime is not - // available, use QueryPerformanceCounter. Note that this may cause some - // of the tests to fail. 
- LARGE_INTEGER li; - QueryPerformanceCounter(&li); - li.QuadPart *= std::micro::den; - li.QuadPart /= perf_counter_frequency_; - return li.QuadPart; - } +uint64_t WinEnvIO::NowMicros() { + // On Windows 7 and below, where GetSystemTimePreciseAsFileTime is not + // available, use QueryPerformanceCounter. Note that this may cause some + // of the tests to fail. + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + li.QuadPart *= std::micro::den; + li.QuadPart /= perf_counter_frequency_; + return li.QuadPart; +} #endif // (_WIN32_WINNT >= _WIN32_WINNT_WIN8) +uint64_t WinEnvIO::NowNanos() { + // all std::chrono clocks on windows have the same resolution that is only + // good enough for microseconds but not nanoseconds + // On Windows 8 and Windows 2012 Server + // GetSystemTimePreciseAsFileTime(¤t_time) can be used + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + // Convert to nanoseconds first to avoid loss of precision + // and divide by frequency + li.QuadPart *= std::nano::den; + li.QuadPart /= perf_counter_frequency_; + return li.QuadPart; +} - virtual uint64_t NowNanos() override { - // all std::chrono clocks on windows have the same resolution that is only - // good enough for microseconds but not nanoseconds - // On Windows 8 and Windows 2012 Server - // GetSystemTimePreciseAsFileTime(¤t_time) can be used - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - // Convert to nanoseconds first to avoid loss of precision - // and divide by frequency - li.QuadPart *= std::nano::den; - li.QuadPart /= perf_counter_frequency_; - return li.QuadPart; - } +Status WinEnvIO::GetHostName(char* name, uint64_t len) { + Status s; + DWORD nSize = static_cast( + std::min(len, std::numeric_limits::max())); - virtual void SleepForMicroseconds(int micros) override { - std::this_thread::sleep_for(std::chrono::microseconds(micros)); + if (!::GetComputerNameA(name, &nSize)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetHostName", lastError); + } else { + 
name[nSize] = 0; } - virtual Status GetHostName(char* name, uint64_t len) override { - Status s; - DWORD nSize = len; + return s; +} - if (!::GetComputerNameA(name, &nSize)) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError("GetHostName", lastError); - } else { - name[nSize] = 0; - } +Status WinEnvIO::GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + // Check if we already have an absolute path + // that starts with non dot and has a semicolon in it + if ((!db_path.empty() && (db_path[0] == '/' || db_path[0] == '\\')) || + (db_path.size() > 2 && db_path[0] != '.' && + ((db_path[1] == ':' && db_path[2] == '\\') || + (db_path[1] == ':' && db_path[2] == '/')))) { + *output_path = db_path; + return Status::OK(); + } + + std::string result; + result.resize(_MAX_PATH); - return s; + char* ret = _getcwd(&result[0], _MAX_PATH); + if (ret == nullptr) { + return Status::IOError("Failed to get current working directory", + strerror(errno)); } - virtual Status GetCurrTime(int64_t* unix_time) { - Status s; + result.resize(strlen(result.data())); - time_t ret = time(nullptr); - if (ret == (time_t)-1) { - *unix_time = 0; - s = IOError("GetCurrTime", errno); - } else { - *unix_time = (int64_t)ret; - } + result.swap(*output_path); + return Status::OK(); +} - return s; - } +std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) { + std::string result; - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override { - // Check if we already have an absolute path - // that starts with non dot and has a semicolon in it - if ((!db_path.empty() && (db_path[0] == '/' || db_path[0] == '\\')) || - (db_path.size() > 2 && db_path[0] != '.' 
&& - ((db_path[1] == ':' && db_path[2] == '\\') || - (db_path[1] == ':' && db_path[2] == '/')))) { - *output_path = db_path; - return Status::OK(); - } + const time_t seconds = secondsSince1970; + const int maxsize = 64; - std::string result; - result.resize(_MAX_PATH); + struct tm t; + errno_t ret = localtime_s(&t, &seconds); - char* ret = _getcwd(&result[0], _MAX_PATH); - if (ret == nullptr) { - return Status::IOError("Failed to get current working directory", - strerror(errno)); - } + if (ret) { + result = std::to_string(seconds); + } else { + result.resize(maxsize); + char* p = &result[0]; - result.resize(strlen(result.data())); + int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec); + assert(len > 0); - result.swap(*output_path); - return Status::OK(); + result.resize(len); } - // Allow increasing the number of worker threads. - virtual void SetBackgroundThreads(int num, Priority pri) override { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); - thread_pools_[pri].SetBackgroundThreads(num); + return result; +} + +EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + optimized.bytes_per_sync = db_options.wal_bytes_per_sync; + optimized.use_os_buffer = + true; // This is because we flush only whole pages on unbuffered io and + // the last records are not guaranteed to be flushed. + // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it + // breaks TransactionLogIteratorStallAtLastRecord unit test. 
Fix the unit + // test and make this false + optimized.fallocate_with_keep_size = true; + return optimized; +} + +EnvOptions WinEnvIO::OptimizeForManifestWrite( + const EnvOptions& env_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + optimized.use_os_buffer = true; + optimized.fallocate_with_keep_size = true; + return optimized; +} + +// Returns true iff the named directory exists and is a directory. +bool WinEnvIO::DirExists(const std::string& dname) { + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) { + return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); } + return false; +} + +//////////////////////////////////////////////////////////////////////// +// WinEnvThreads - virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); - thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); +WinEnvThreads::WinEnvThreads(Env* hosted_env) : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { + + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast(pool_id)); + // This allows later initializing the thread-local-env of each thread. 
+ thread_pools_[pool_id].SetHostEnv(hosted_env); } +} - virtual std::string TimeToString(uint64_t secondsSince1970) override { - std::string result; +WinEnvThreads::~WinEnvThreads() { - const time_t seconds = secondsSince1970; - const int maxsize = 64; + WaitForJoin(); - struct tm t; - errno_t ret = localtime_s(&t, &seconds); + for (auto& thpool : thread_pools_) { + thpool.JoinAllThreads(); + } +} - if (ret) { - result = std::to_string(seconds); - } else { - result.resize(maxsize); - char* p = &result[0]; +void WinEnvThreads::Schedule(void(*function)(void*), void* arg, Env::Priority pri, + void* tag, void(*unschedFunction)(void* arg)) { + assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); +} - int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, - t.tm_min, t.tm_sec); - assert(len > 0); +int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) { + return thread_pools_[pri].UnSchedule(arg); +} - result.resize(len); - } +namespace { - return result; - } + struct StartThreadState { + void(*user_function)(void*); + void* arg; + }; - EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const override { - EnvOptions optimized = env_options; - optimized.use_mmap_writes = false; - optimized.bytes_per_sync = db_options.wal_bytes_per_sync; - optimized.use_os_buffer = - true; // This is because we flush only whole pages on unbuffered io and - // the last records are not guaranteed to be flushed. - // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it - // breaks TransactionLogIteratorStallAtLastRecord unit test. 
Fix the unit - // test and make this false - optimized.fallocate_with_keep_size = true; - return optimized; + void* StartThreadWrapper(void* arg) { + std::unique_ptr state( + reinterpret_cast(arg)); + state->user_function(state->arg); + return nullptr; } - EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const override { - EnvOptions optimized = env_options; - optimized.use_mmap_writes = false; - optimized.use_os_buffer = true; - optimized.fallocate_with_keep_size = true; - return optimized; - } +} - private: - // Returns true iff the named directory exists and is a directory. - virtual bool DirExists(const std::string& dname) { - WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) { - return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); - } - return false; +void WinEnvThreads::StartThread(void(*function)(void* arg), void* arg) { + std::unique_ptr state(new StartThreadState); + state->user_function = function; + state->arg = arg; + try { + + std::thread th(&StartThreadWrapper, state.get()); + state.release(); + + std::lock_guard lg(mu_); + threads_to_join_.push_back(std::move(th)); + + } catch (const std::system_error& ex) { + WinthreadCall("start thread", ex.code()); } +} - bool SupportsFastAllocate(const std::string& /* path */) { return false; } - - class ThreadPool { - public: - ThreadPool() - : total_threads_limit_(1), - bgthreads_(0), - queue_(), - queue_len_(0U), - exit_all_threads_(false), - low_io_priority_(false), - env_(nullptr) {} - - ~ThreadPool() { assert(bgthreads_.size() == 0U); } - - void JoinAllThreads() { - { - std::lock_guard lock(mu_); - assert(!exit_all_threads_); - exit_all_threads_ = true; - bgsignal_.notify_all(); - } +void WinEnvThreads::WaitForJoin() { + for (auto& th : threads_to_join_) { + th.join(); + } + threads_to_join_.clear(); +} - for (std::thread& th : bgthreads_) { - th.join(); - } +unsigned int 
WinEnvThreads::GetThreadPoolQueueLen(Env::Priority pri) const { + assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + return thread_pools_[pri].GetQueueLen(); +} - // Subject to assert in the __dtor - bgthreads_.clear(); - } +uint64_t WinEnvThreads::gettid() { + uint64_t thread_id = GetCurrentThreadId(); + return thread_id; +} - void SetHostEnv(Env* env) { env_ = env; } +uint64_t WinEnvThreads::GetThreadID() const { return gettid(); } - // Return true if there is at least one thread needs to terminate. - bool HasExcessiveThread() const { - return bgthreads_.size() > total_threads_limit_; - } +void WinEnvThreads::SleepForMicroseconds(int micros) { + std::this_thread::sleep_for(std::chrono::microseconds(micros)); +} - // Return true iff the current thread is the excessive thread to terminate. - // Always terminate the running thread that is added last, even if there are - // more than one thread to terminate. - bool IsLastExcessiveThread(size_t thread_id) const { - return HasExcessiveThread() && thread_id == bgthreads_.size() - 1; - } +void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { + assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); +} - // Is one of the threads to terminate. - bool IsExcessiveThread(size_t thread_id) const { - return thread_id >= total_threads_limit_; - } +void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { + assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); +} - // Return the thread priority. - // This would allow its member-thread to know its priority. - Env::Priority GetThreadPriority() { return priority_; } - - // Set the thread priority. 
- void SetThreadPriority(Env::Priority priority) { priority_ = priority; } - - void BGThread(size_t thread_id) { - while (true) { - // Wait until there is an item that is ready to run - std::unique_lock uniqueLock(mu_); - - // Stop waiting if the thread needs to do work or needs to terminate. - while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) && - (queue_.empty() || IsExcessiveThread(thread_id))) { - bgsignal_.wait(uniqueLock); - } - - if (exit_all_threads_) { - // mechanism to let BG threads exit safely - uniqueLock.unlock(); - break; - } - - if (IsLastExcessiveThread(thread_id)) { - // Current thread is the last generated one and is excessive. - // We always terminate excessive thread in the reverse order of - // generation time. - std::thread& terminating_thread = bgthreads_.back(); - auto tid = terminating_thread.get_id(); - // Ensure that that this thread is ours - assert(tid == std::this_thread::get_id()); - terminating_thread.detach(); - bgthreads_.pop_back(); - - if (HasExcessiveThread()) { - // There is still at least more excessive thread to terminate. - WakeUpAllThreads(); - } - - uniqueLock.unlock(); - - PrintThreadInfo(thread_id, gettid()); - break; - } - - void (*function)(void*) = queue_.front().function; - void* arg = queue_.front().arg; - queue_.pop_front(); - queue_len_.store(queue_.size(), std::memory_order_relaxed); - - uniqueLock.unlock(); - (*function)(arg); - } - } +///////////////////////////////////////////////////////////////////////// +// WinEnv - // Helper struct for passing arguments when creating threads. - struct BGThreadMetadata { - ThreadPool* thread_pool_; - size_t thread_id_; // Thread count in the thread. 
- - BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id) - : thread_pool_(thread_pool), thread_id_(thread_id) {} - }; - - static void* BGThreadWrapper(void* arg) { - std::unique_ptr meta( - reinterpret_cast(arg)); - - size_t thread_id = meta->thread_id_; - ThreadPool* tp = meta->thread_pool_; - -#if ROCKSDB_USING_THREAD_STATUS - // for thread-status - ThreadStatusUtil::RegisterThread( - tp->env_, (tp->GetThreadPriority() == Env::Priority::HIGH - ? ThreadStatus::HIGH_PRIORITY - : ThreadStatus::LOW_PRIORITY)); -#endif - tp->BGThread(thread_id); -#if ROCKSDB_USING_THREAD_STATUS - ThreadStatusUtil::UnregisterThread(); -#endif - return nullptr; - } +WinEnv::WinEnv() : winenv_io_(this), winenv_threads_(this) { + // Protected member of the base class + thread_status_updater_ = CreateThreadStatusUpdater(); +} - void WakeUpAllThreads() { bgsignal_.notify_all(); } - void SetBackgroundThreadsInternal(size_t num, bool allow_reduce) { - std::lock_guard lg(mu_); +WinEnv::~WinEnv() { + // All threads must be joined before the deletion of + // thread_status_updater_. 
+ delete thread_status_updater_; +} - if (exit_all_threads_) { - return; - } +Status WinEnv::GetThreadList( + std::vector* thread_list) { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); +} - if (num > total_threads_limit_ || - (num < total_threads_limit_ && allow_reduce)) { - total_threads_limit_ = std::max(size_t(1), num); - WakeUpAllThreads(); - StartBGThreads(); - } - assert(total_threads_limit_ > 0); - } +Status WinEnv::DeleteFile(const std::string& fname) { + return winenv_io_.DeleteFile(fname); +} - void IncBackgroundThreadsIfNeeded(int num) { - SetBackgroundThreadsInternal(num, false); - } +Status WinEnv::GetCurrentTime(int64_t* unix_time) { + return winenv_io_.GetCurrentTime(unix_time); +} - void SetBackgroundThreads(int num) { - SetBackgroundThreadsInternal(num, true); - } +Status WinEnv::NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + return winenv_io_.NewSequentialFile(fname, result, options); +} - void StartBGThreads() { - // Start background thread if necessary - while (bgthreads_.size() < total_threads_limit_) { - std::thread p_t(&ThreadPool::BGThreadWrapper, - new BGThreadMetadata(this, bgthreads_.size())); - bgthreads_.push_back(std::move(p_t)); - } - } +Status WinEnv::NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + return winenv_io_.NewRandomAccessFile(fname, result, options); +} - void Schedule(void (*function)(void* arg1), void* arg, void* tag) { - std::lock_guard lg(mu_); +Status WinEnv::NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + return winenv_io_.NewWritableFile(fname, result, options); +} - if (exit_all_threads_) { - return; - } +Status WinEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + return winenv_io_.NewDirectory(name, result); +} - StartBGThreads(); - - // Add to priority queue - 
queue_.push_back(BGItem()); - queue_.back().function = function; - queue_.back().arg = arg; - queue_.back().tag = tag; - queue_len_.store(queue_.size(), std::memory_order_relaxed); - - if (!HasExcessiveThread()) { - // Wake up at least one waiting thread. - bgsignal_.notify_one(); - } else { - // Need to wake up all threads to make sure the one woken - // up is not the one to terminate. - WakeUpAllThreads(); - } - } +Status WinEnv::FileExists(const std::string& fname) { + return winenv_io_.FileExists(fname); +} - int UnSchedule(void* arg) { - int count = 0; +Status WinEnv::GetChildren(const std::string& dir, + std::vector* result) { + return winenv_io_.GetChildren(dir, result); +} - std::lock_guard lg(mu_); +Status WinEnv::CreateDir(const std::string& name) { + return winenv_io_.CreateDir(name); +} - // Remove from priority queue - BGQueue::iterator it = queue_.begin(); - while (it != queue_.end()) { - if (arg == (*it).tag) { - it = queue_.erase(it); - count++; - } else { - ++it; - } - } +Status WinEnv::CreateDirIfMissing(const std::string& name) { + return winenv_io_.CreateDirIfMissing(name); +} - queue_len_.store(queue_.size(), std::memory_order_relaxed); +Status WinEnv::DeleteDir(const std::string& name) { + return winenv_io_.DeleteDir(name); +} - return count; - } +Status WinEnv::GetFileSize(const std::string& fname, + uint64_t* size) { + return winenv_io_.GetFileSize(fname, size); +} - unsigned int GetQueueLen() const { - return static_cast( - queue_len_.load(std::memory_order_relaxed)); - } +Status WinEnv::GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + return winenv_io_.GetFileModificationTime(fname, file_mtime); +} - private: - // Entry per Schedule() call - struct BGItem { - void* arg; - void (*function)(void*); - void* tag; - }; - - typedef std::deque BGQueue; - - std::mutex mu_; - std::condition_variable bgsignal_; - size_t total_threads_limit_; - std::vector bgthreads_; - BGQueue queue_; - std::atomic_size_t queue_len_; // 
Queue length. Used for stats reporting - bool exit_all_threads_; - bool low_io_priority_; - Env::Priority priority_; - Env* env_; - }; +Status WinEnv::RenameFile(const std::string& src, + const std::string& target) { + return winenv_io_.RenameFile(src, target); +} - bool checkedDiskForMmap_; - bool forceMmapOff; // do we override Env options? - size_t page_size_; - size_t allocation_granularity_; - uint64_t perf_counter_frequency_; - std::vector thread_pools_; - mutable std::mutex mu_; - std::vector threads_to_join_; -}; - -WinEnv::WinEnv() - : checkedDiskForMmap_(false), - forceMmapOff(false), - page_size_(4 * 1012), - allocation_granularity_(page_size_), - perf_counter_frequency_(0), - thread_pools_(Priority::TOTAL) { - SYSTEM_INFO sinfo; - GetSystemInfo(&sinfo); +Status WinEnv::LinkFile(const std::string& src, + const std::string& target) { + return winenv_io_.LinkFile(src, target); +} - page_size_ = sinfo.dwPageSize; - allocation_granularity_ = sinfo.dwAllocationGranularity; +Status WinEnv::LockFile(const std::string& lockFname, + FileLock** lock) { + return winenv_io_.LockFile(lockFname, lock); +} - { - LARGE_INTEGER qpf; - BOOL ret = QueryPerformanceFrequency(&qpf); - assert(ret == TRUE); - perf_counter_frequency_ = qpf.QuadPart; - } +Status WinEnv::UnlockFile(FileLock* lock) { + return winenv_io_.UnlockFile(lock); +} - for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { - thread_pools_[pool_id].SetThreadPriority( - static_cast(pool_id)); - // This allows later initializing the thread-local-env of each thread. 
- thread_pools_[pool_id].SetHostEnv(this); - } +Status WinEnv::GetTestDirectory(std::string* result) { + return winenv_io_.GetTestDirectory(result); +} - // Protected member of the base class - thread_status_updater_ = CreateThreadStatusUpdater(); +Status WinEnv::NewLogger(const std::string& fname, + std::shared_ptr* result) { + return winenv_io_.NewLogger(fname, result); } -void WinEnv::Schedule(void (*function)(void*), void* arg, Priority pri, - void* tag) { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); - thread_pools_[pri].Schedule(function, arg, tag); +uint64_t WinEnv::NowMicros() { + return winenv_io_.NowMicros(); } -int WinEnv::UnSchedule(void* arg, Priority pri) { - return thread_pools_[pri].UnSchedule(arg); +uint64_t WinEnv::NowNanos() { + return winenv_io_.NowNanos(); } -unsigned int WinEnv::GetThreadPoolQueueLen(Priority pri) const { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); - return thread_pools_[pri].GetQueueLen(); +Status WinEnv::GetHostName(char* name, uint64_t len) { + return winenv_io_.GetHostName(name, len); } -namespace { -struct StartThreadState { - void (*user_function)(void*); - void* arg; -}; +Status WinEnv::GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + return winenv_io_.GetAbsolutePath(db_path, output_path); } -static void* StartThreadWrapper(void* arg) { - std::unique_ptr state( - reinterpret_cast(arg)); - state->user_function(state->arg); - return nullptr; +std::string WinEnv::TimeToString(uint64_t secondsSince1970) { + return winenv_io_.TimeToString(secondsSince1970); } -void WinEnv::StartThread(void (*function)(void* arg), void* arg) { - StartThreadState* state = new StartThreadState; - state->user_function = function; - state->arg = arg; - try { - std::thread th(&StartThreadWrapper, state); +void WinEnv::Schedule(void(*function)(void*), void* arg, Env::Priority pri, + void* tag, + void(*unschedFunction)(void* arg)) { + return winenv_threads_.Schedule(function, arg, pri, tag, 
unschedFunction); +} - std::lock_guard lg(mu_); - threads_to_join_.push_back(std::move(th)); +int WinEnv::UnSchedule(void* arg, Env::Priority pri) { + return winenv_threads_.UnSchedule(arg, pri); +} - } catch (const std::system_error& ex) { - WinthreadCall("start thread", ex.code()); - } +void WinEnv::StartThread(void(*function)(void* arg), void* arg) { + return winenv_threads_.StartThread(function, arg); } void WinEnv::WaitForJoin() { - for (auto& th : threads_to_join_) { - th.join(); - } + return winenv_threads_.WaitForJoin(); +} - threads_to_join_.clear(); +unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { + return winenv_threads_.GetThreadPoolQueueLen(pri); } -} // namespace +uint64_t WinEnv::GetThreadID() const { + return winenv_threads_.GetThreadID(); +} + +void WinEnv::SleepForMicroseconds(int micros) { + return winenv_threads_.SleepForMicroseconds(micros); +} + +// Allow increasing the number of worker threads. +void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { + return winenv_threads_.SetBackgroundThreads(num, pri); +} + +void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { + return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); +} + +EnvOptions WinEnv::OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const { + return winenv_io_.OptimizeForLogWrite(env_options, db_options); +} + +EnvOptions WinEnv::OptimizeForManifestWrite( + const EnvOptions& env_options) const { + return winenv_io_.OptimizeForManifestWrite(env_options); +} + +} // namespace port std::string Env::GenerateUniqueId() { std::string result; @@ -2090,24 +1035,4 @@ std::string Env::GenerateUniqueId() { return result; } -// We choose to create this on the heap and using std::once for the following -// reasons -// 1) Currently available MS compiler does not implement atomic C++11 -// initialization of -// function local statics -// 2) We choose not to destroy the env because joining the threads from the 
-// system loader -// which destroys the statics (same as from DLLMain) creates a system loader -// dead-lock. -// in this manner any remaining threads are terminated OK. -namespace { -std::once_flag winenv_once_flag; -Env* envptr; -}; - -Env* Env::Default() { - std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); }); - return envptr; -} - } // namespace rocksdb diff --git a/external/rocksdb/port/win/env_win.h b/external/rocksdb/port/win/env_win.h new file mode 100644 index 0000000000..9b1e012c90 --- /dev/null +++ b/external/rocksdb/port/win/env_win.h @@ -0,0 +1,276 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the rocksdb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. 
+ +#pragma once + +#include +#include "util/threadpool.h" + +#include +#include + +namespace rocksdb { +namespace port { + +// Currently not designed for inheritance but rather a replacement +class WinEnvThreads { +public: + + explicit WinEnvThreads(Env* hosted_env); + + ~WinEnvThreads(); + + WinEnvThreads(const WinEnvThreads&) = delete; + WinEnvThreads& operator=(const WinEnvThreads&) = delete; + + void Schedule(void(*function)(void*), void* arg, Env::Priority pri, + void* tag, + void(*unschedFunction)(void* arg)); + + int UnSchedule(void* arg, Env::Priority pri); + + void StartThread(void(*function)(void* arg), void* arg); + + void WaitForJoin(); + + unsigned int GetThreadPoolQueueLen(Env::Priority pri) const; + + static uint64_t gettid(); + + uint64_t GetThreadID() const; + + void SleepForMicroseconds(int micros); + + // Allow increasing the number of worker threads. + void SetBackgroundThreads(int num, Env::Priority pri); + + void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri); + +private: + + Env* hosted_env_; + mutable std::mutex mu_; + std::vector thread_pools_; + std::vector threads_to_join_; + +}; + +// Designed for inheritance so can be re-used +// but certain parts replaced +class WinEnvIO { +public: + explicit WinEnvIO(Env* hosted_env); + + virtual ~WinEnvIO(); + + virtual Status DeleteFile(const std::string& fname); + + virtual Status GetCurrentTime(int64_t* unix_time); + + virtual Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + virtual Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + virtual Status NewDirectory(const std::string& name, + std::unique_ptr* result); + + virtual Status FileExists(const std::string& fname); + + virtual Status GetChildren(const std::string& dir, + std::vector* result); + 
+ virtual Status CreateDir(const std::string& name); + + virtual Status CreateDirIfMissing(const std::string& name); + + virtual Status DeleteDir(const std::string& name); + + virtual Status GetFileSize(const std::string& fname, + uint64_t* size); + + static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime); + + virtual Status RenameFile(const std::string& src, + const std::string& target); + + virtual Status LinkFile(const std::string& src, + const std::string& target); + + virtual Status LockFile(const std::string& lockFname, + FileLock** lock); + + virtual Status UnlockFile(FileLock* lock); + + virtual Status GetTestDirectory(std::string* result); + + virtual Status NewLogger(const std::string& fname, + std::shared_ptr* result); + + virtual uint64_t NowMicros(); + + virtual uint64_t NowNanos(); + + virtual Status GetHostName(char* name, uint64_t len); + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path); + + virtual std::string TimeToString(uint64_t secondsSince1970); + + virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const; + + virtual EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const; + + size_t GetPageSize() const { return page_size_; } + + size_t GetAllocationGranularity() const { return allocation_granularity_; } + + uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } + +private: + // Returns true iff the named directory exists and is a directory. 
+ virtual bool DirExists(const std::string& dname); + + typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); + + Env* hosted_env_; + size_t page_size_; + size_t allocation_granularity_; + uint64_t perf_counter_frequency_; + FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; +}; + +class WinEnv : public Env { +public: + WinEnv(); + + ~WinEnv(); + + Status DeleteFile(const std::string& fname) override; + + Status GetCurrentTime(int64_t* unix_time) override; + + Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override; + + Status FileExists(const std::string& fname) override; + + Status GetChildren(const std::string& dir, + std::vector* result) override; + + Status CreateDir(const std::string& name) override; + + Status CreateDirIfMissing(const std::string& name) override; + + Status DeleteDir(const std::string& name) override; + + Status GetFileSize(const std::string& fname, + uint64_t* size) override; + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override; + + Status RenameFile(const std::string& src, + const std::string& target) override; + + Status LinkFile(const std::string& src, + const std::string& target) override; + + Status LockFile(const std::string& lockFname, + FileLock** lock) override; + + Status UnlockFile(FileLock* lock) override; + + Status GetTestDirectory(std::string* result) override; + + Status NewLogger(const std::string& fname, + std::shared_ptr* result) override; + + uint64_t NowMicros() override; + + uint64_t NowNanos() override; + + Status GetHostName(char* name, uint64_t len) override; + + 
Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override; + + std::string TimeToString(uint64_t secondsSince1970) override; + + Status GetThreadList( + std::vector* thread_list) override; + + void Schedule(void(*function)(void*), void* arg, Env::Priority pri, + void* tag, + void(*unschedFunction)(void* arg)) override; + + int UnSchedule(void* arg, Env::Priority pri) override; + + void StartThread(void(*function)(void* arg), void* arg) override; + + void WaitForJoin(); + + unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override; + + uint64_t GetThreadID() const override; + + void SleepForMicroseconds(int micros) override; + + // Allow increasing the number of worker threads. + void SetBackgroundThreads(int num, Env::Priority pri) override; + + void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; + + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override; + + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override; + +private: + + WinEnvIO winenv_io_; + WinEnvThreads winenv_threads_; + +}; + +} +} \ No newline at end of file diff --git a/external/rocksdb/port/win/io_win.cc b/external/rocksdb/port/win/io_win.cc new file mode 100644 index 0000000000..c9ef1f29e1 --- /dev/null +++ b/external/rocksdb/port/win/io_win.cc @@ -0,0 +1,963 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/win/io_win.h" + +#include "util/sync_point.h" +#include "util/coding.h" +#include "util/iostats_context_imp.h" +#include "util/sync_point.h" +#include "util/aligned_buffer.h" + + +namespace rocksdb { +namespace port { + +std::string GetWindowsErrSz(DWORD err) { + LPSTR lpMsgBuf; + FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast(&lpMsgBuf), 0, NULL); + + std::string Err = lpMsgBuf; + LocalFree(lpMsgBuf); + return Err; +} + +// We preserve the original name of this interface to denote the original idea +// behind it. +// All reads happen by a specified offset and pwrite interface does not change +// the position of the file pointer. Judging from the man page and errno it does +// execute +// lseek atomically to return the position of the file back where it was. +// WriteFile() does not +// have this capability. Therefore, for both pread and pwrite the pointer is +// advanced to the next position +// which is fine for writes because they are (should be) sequential. +// Because all the reads/writes happen by the specified offset, the caller in +// theory should not +// rely on the current file offset. 
+SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, + uint64_t offset) { + assert(numBytes <= std::numeric_limits::max()); + OVERLAPPED overlapped = { 0 }; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + SSIZE_T result = 0; + + unsigned long bytesWritten = 0; + + if (FALSE == WriteFile(hFile, src, static_cast(numBytes), &bytesWritten, + &overlapped)) { + result = -1; + } else { + result = bytesWritten; + } + + return result; +} + +// See comments for pwrite above +SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset) { + assert(numBytes <= std::numeric_limits::max()); + OVERLAPPED overlapped = { 0 }; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + SSIZE_T result = 0; + + unsigned long bytesRead = 0; + + if (FALSE == ReadFile(hFile, src, static_cast(numBytes), &bytesRead, + &overlapped)) { + return -1; + } else { + result = bytesRead; + } + + return result; +} + +// SetFileInformationByHandle() is capable of fast pre-allocates. +// However, this does not change the file end position unless the file is +// truncated and the pre-allocated space is not considered filled with zeros. 
+Status fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + Status status; + + FILE_ALLOCATION_INFO alloc_info; + alloc_info.AllocationSize.QuadPart = to_size; + + if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, + sizeof(FILE_ALLOCATION_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError( + "Failed to pre-allocate space: " + filename, lastError); + } + + return status; +} + +Status ftruncate(const std::string& filename, HANDLE hFile, + uint64_t toSize) { + Status status; + + FILE_END_OF_FILE_INFO end_of_file; + end_of_file.EndOfFile.QuadPart = toSize; + + if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, + sizeof(FILE_END_OF_FILE_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, + lastError); + } + + return status; +} + +size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) { + + if (max_size < kMaxVarint64Length * 3) { + return 0; + } + + BY_HANDLE_FILE_INFORMATION FileInfo; + + BOOL result = GetFileInformationByHandle(hFile, &FileInfo); + + TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); + + if (!result) { + return 0; + } + + char* rid = id; + rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber)); + rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh)); + rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow)); + + assert(rid >= id); + return static_cast(rid - id); +} + +WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, + const void* mapped_region, size_t length) + : fileName_(fileName), + hFile_(hFile), + hMap_(hMap), + mapped_region_(mapped_region), + length_(length) {} + +WinMmapReadableFile::~WinMmapReadableFile() { + BOOL ret = ::UnmapViewOfFile(mapped_region_); + assert(ret); + + ret = ::CloseHandle(hMap_); + assert(ret); + + ret = ::CloseHandle(hFile_); + assert(ret); +} + 
+Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + + if (offset > length_) { + *result = Slice(); + return IOError(fileName_, EINVAL); + } else if (offset + n > length_) { + n = length_ - offset; + } + *result = + Slice(reinterpret_cast(mapped_region_)+offset, n); + return s; +} + +Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +// Can only truncate or reserve to a sector size aligned if +// used on files that are opened with Unbuffered I/O +Status WinMmapFile::TruncateFile(uint64_t toSize) { + return ftruncate(filename_, hFile_, toSize); +} + +Status WinMmapFile::UnmapCurrentRegion() { + Status status; + + if (mapped_begin_ != nullptr) { + if (!::UnmapViewOfFile(mapped_begin_)) { + status = IOErrorFromWindowsError( + "Failed to unmap file view: " + filename_, GetLastError()); + } + + // Move on to the next portion of the file + file_offset_ += view_size_; + + // UnmapView automatically sends data to disk but not the metadata + // which is good and provides some equivalent of fdatasync() on Linux + // therefore, we donot need separate flag for metadata + mapped_begin_ = nullptr; + mapped_end_ = nullptr; + dst_ = nullptr; + + last_sync_ = nullptr; + pending_sync_ = false; + } + + return status; +} + +Status WinMmapFile::MapNewRegion() { + + Status status; + + assert(mapped_begin_ == nullptr); + + size_t minDiskSize = file_offset_ + view_size_; + + if (minDiskSize > reserved_size_) { + status = Allocate(file_offset_, view_size_); + if (!status.ok()) { + return status; + } + } + + // Need to remap + if (hMap_ == NULL || reserved_size_ > mapping_size_) { + + if (hMap_ != NULL) { + // Unmap the previous one + BOOL ret = ::CloseHandle(hMap_); + assert(ret); + hMap_ = NULL; + } + + ULARGE_INTEGER mappingSize; + 
mappingSize.QuadPart = reserved_size_; + + hMap_ = CreateFileMappingA( + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name + + if (NULL == hMap_) { + return IOErrorFromWindowsError( + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); + } + + mapping_size_ = reserved_size_; + } + + ULARGE_INTEGER offset; + offset.QuadPart = file_offset_; + + // View must begin at the granularity aligned offset + mapped_begin_ = reinterpret_cast( + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); + + if (!mapped_begin_) { + status = IOErrorFromWindowsError( + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); + } else { + mapped_end_ = mapped_begin_ + view_size_; + dst_ = mapped_begin_; + last_sync_ = mapped_begin_; + pending_sync_ = false; + } + return status; +} + +Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(filename_, hFile_, spaceToReserve); +} + +WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, + size_t allocation_granularity, const EnvOptions& options) + : filename_(fname), + hFile_(hFile), + hMap_(NULL), + page_size_(page_size), + allocation_granularity_(allocation_granularity), + reserved_size_(0), + mapping_size_(0), + view_size_(0), + mapped_begin_(nullptr), + mapped_end_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0), + pending_sync_(false) { + // Allocation granularity must be obtained from GetSystemInfo() and must be + // a power of two. 
+ assert(allocation_granularity > 0); + assert((allocation_granularity & (allocation_granularity - 1)) == 0); + + assert(page_size > 0); + assert((page_size & (page_size - 1)) == 0); + + // Only for memory mapped writes + assert(options.use_mmap_writes); + + // View size must be both the multiple of allocation_granularity AND the + // page size and the granularity is usually a multiple of a page size. + const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + view_size_ = Roundup(viewSize, allocation_granularity_); +} + +WinMmapFile::~WinMmapFile() { + if (hFile_) { + this->Close(); + } +} + +Status WinMmapFile::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + + while (left > 0) { + assert(mapped_begin_ <= dst_); + size_t avail = mapped_end_ - dst_; + + if (avail == 0) { + Status s = UnmapCurrentRegion(); + if (s.ok()) { + s = MapNewRegion(); + } + + if (!s.ok()) { + return s; + } + } else { + size_t n = std::min(left, avail); + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + pending_sync_ = true; + } + } + + // Now make sure that the last partial page is padded with zeros if needed + size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_); + if (bytesToPad > 0) { + memset(dst_, 0, bytesToPad); + } + + return Status::OK(); +} + +// Means Close() will properly take care of truncate +// and it does not need any additional information +Status WinMmapFile::Truncate(uint64_t size) { + return Status::OK(); +} + +Status WinMmapFile::Close() { + Status s; + + assert(NULL != hFile_); + + // We truncate to the precise size so no + // uninitialized data at the end. SetEndOfFile + // which we use does not write zeros and it is good. 
+ uint64_t targetSize = GetFileSize(); + + if (mapped_begin_ != nullptr) { + // Sync before unmapping to make sure everything + // is on disk and there is not a lazy writing + // so we are deterministic with the tests + Sync(); + s = UnmapCurrentRegion(); + } + + if (NULL != hMap_) { + BOOL ret = ::CloseHandle(hMap_); + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to Close mapping for file: " + filename_, lastError); + } + + hMap_ = NULL; + } + + if (hFile_ != NULL) { + + TruncateFile(targetSize); + + BOOL ret = ::CloseHandle(hFile_); + hFile_ = NULL; + + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to close file map handle: " + filename_, lastError); + } + } + + return s; +} + +Status WinMmapFile::Flush() { return Status::OK(); } + +// Flush only data +Status WinMmapFile::Sync() { + Status s; + + // Some writes occurred since last sync + if (dst_ > last_sync_) { + assert(mapped_begin_); + assert(dst_); + assert(dst_ > mapped_begin_); + assert(dst_ < mapped_end_); + + size_t page_begin = + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + size_t page_end = + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + + // Flush only the amount of that is a multiple of pages + if (!::FlushViewOfFile(mapped_begin_ + page_begin, + (page_end - page_begin) + page_size_)) { + s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, + GetLastError()); + } else { + last_sync_ = dst_; + } + } + + return s; +} + +/** +* Flush data as well as metadata to stable storage. +*/ +Status WinMmapFile::Fsync() { + Status s = Sync(); + + // Flush metadata + if (s.ok() && pending_sync_) { + if (!::FlushFileBuffers(hFile_)) { + s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, + GetLastError()); + } + pending_sync_ = false; + } + + return s; +} + +/** +* Get the size of valid data in the file. 
This will not match the +* size that is returned from the filesystem because we use mmap +* to extend file by map_size every time. +*/ +uint64_t WinMmapFile::GetFileSize() { + size_t used = dst_ - mapped_begin_; + return file_offset_ + used; +} + +Status WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) { + Status status; + TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = Roundup(offset + len, view_size_); + // Nothing to do + if (spaceToReserve <= reserved_size_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reserved_size_ = spaceToReserve; + } + return status; +} + +size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, + const EnvOptions& options) + : filename_(fname), + file_(f), + use_os_buffer_(options.use_os_buffer) +{} + +WinSequentialFile::~WinSequentialFile() { + assert(file_ != INVALID_HANDLE_VALUE); + CloseHandle(file_); +} + +Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = 0; + + // Windows ReadFile API accepts a DWORD. + // While it is possible to read in a loop if n is > UINT_MAX + // it is a highly unlikely case. 
+ if (n > UINT_MAX) { + return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); + } + + DWORD bytesToRead = static_cast(n); //cast is safe due to the check above + DWORD bytesRead = 0; + BOOL ret = ReadFile(file_, scratch, bytesToRead, &bytesRead, NULL); + if (ret == TRUE) { + r = bytesRead; + } else { + return IOErrorFromWindowsError(filename_, GetLastError()); + } + + *result = Slice(scratch, r); + + return s; +} + +Status WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit + // integer. As such it is a highly unlikley case to have n so large. + if (n > _I64_MAX) { + return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); + } + + LARGE_INTEGER li; + li.QuadPart = static_cast(n); //cast is safe due to the check above + BOOL ret = SetFilePointerEx(file_, li, NULL, FILE_CURRENT); + if (ret == FALSE) { + return IOErrorFromWindowsError(filename_, GetLastError()); + } + return Status::OK(); +} + +Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +SSIZE_T WinRandomAccessFile::ReadIntoBuffer(uint64_t user_offset, uint64_t first_page_start, + size_t bytes_to_read, size_t& left, + AlignedBuffer& buffer, char* dest) const { + assert(buffer.CurrentSize() == 0); + assert(buffer.Capacity() >= bytes_to_read); + + SSIZE_T read = + PositionedReadInternal(buffer.Destination(), bytes_to_read, first_page_start); + + if (read > 0) { + buffer.Size(read); + + // Let's figure out how much we read from the users standpoint + if ((first_page_start + buffer.CurrentSize()) > user_offset) { + assert(first_page_start <= user_offset); + size_t buffer_offset = user_offset - first_page_start; + read = buffer.Read(dest, buffer_offset, left); + } else { + read = 0; + } + left -= read; + } + return read; +} + +SSIZE_T WinRandomAccessFile::ReadIntoOneShotBuffer(uint64_t user_offset, uint64_t first_page_start, + size_t bytes_to_read, size_t& left, + char* 
dest) const { + AlignedBuffer bigBuffer; + bigBuffer.Alignment(buffer_.Alignment()); + bigBuffer.AllocateNewBuffer(bytes_to_read); + + return ReadIntoBuffer(user_offset, first_page_start, bytes_to_read, left, + bigBuffer, dest); +} + +SSIZE_T WinRandomAccessFile::ReadIntoInstanceBuffer(uint64_t user_offset, + uint64_t first_page_start, + size_t bytes_to_read, size_t& left, + char* dest) const { + SSIZE_T read = ReadIntoBuffer(user_offset, first_page_start, bytes_to_read, + left, buffer_, dest); + + if (read > 0) { + buffered_start_ = first_page_start; + } + + return read; +} + +void WinRandomAccessFile::CalculateReadParameters(uint64_t offset, size_t bytes_requested, + size_t& actual_bytes_toread, + uint64_t& first_page_start) const { + + const size_t alignment = buffer_.Alignment(); + + first_page_start = TruncateToPageBoundary(alignment, offset); + const uint64_t last_page_start = + TruncateToPageBoundary(alignment, offset + bytes_requested - 1); + actual_bytes_toread = (last_page_start - first_page_start) + alignment; +} + +SSIZE_T WinRandomAccessFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset) const { + return pread(hFile_, src, numBytes, offset); +} + +WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, + const EnvOptions& options) + : filename_(fname), + hFile_(hFile), + use_os_buffer_(options.use_os_buffer), + read_ahead_(false), + compaction_readahead_size_(options.compaction_readahead_size), + random_access_max_buffer_size_(options.random_access_max_buffer_size), + buffer_(), + buffered_start_(0) { + assert(!options.use_mmap_reads); + + // Unbuffered access, use internal buffer for reads + if (!use_os_buffer_) { + // Do not allocate the buffer either until the first request or + // until there is a call to allocate a read-ahead buffer + buffer_.Alignment(alignment); + } +} + +WinRandomAccessFile::~WinRandomAccessFile() { + if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) { + 
::CloseHandle(hFile_); + } +} + +void WinRandomAccessFile::EnableReadAhead() { this->Hint(SEQUENTIAL); } + +Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + + Status s; + SSIZE_T r = -1; + size_t left = n; + char* dest = scratch; + + if (n == 0) { + *result = Slice(scratch, 0); + return s; + } + + // When in unbuffered mode we need to do the following changes: + // - use our own aligned buffer + // - always read at the offset of that is a multiple of alignment + if (!use_os_buffer_) { + + uint64_t first_page_start = 0; + size_t actual_bytes_toread = 0; + size_t bytes_requested = left; + + if (!read_ahead_ && random_access_max_buffer_size_ == 0) { + CalculateReadParameters(offset, bytes_requested, actual_bytes_toread, + first_page_start); + + assert(actual_bytes_toread > 0); + + r = ReadIntoOneShotBuffer(offset, first_page_start, + actual_bytes_toread, left, dest); + } else { + + std::unique_lock lock(buffer_mut_); + + // Let's see if at least some of the requested data is already + // in the buffer + if (offset >= buffered_start_ && + offset < (buffered_start_ + buffer_.CurrentSize())) { + size_t buffer_offset = offset - buffered_start_; + r = buffer_.Read(dest, buffer_offset, left); + assert(r >= 0); + + left -= size_t(r); + offset += r; + dest += r; + } + + // Still some left or none was buffered + if (left > 0) { + // Figure out the start/end offset for reading and amount to read + bytes_requested = left; + + if (read_ahead_ && bytes_requested < compaction_readahead_size_) { + bytes_requested = compaction_readahead_size_; + } + + CalculateReadParameters(offset, bytes_requested, actual_bytes_toread, + first_page_start); + + assert(actual_bytes_toread > 0); + + if (buffer_.Capacity() < actual_bytes_toread) { + // If we are in read-ahead mode or the requested size + // exceeds max buffer size then use one-shot + // big buffer otherwise reallocate main buffer + if (read_ahead_ || + (actual_bytes_toread > 
random_access_max_buffer_size_)) { + // Unlock the mutex since we are not using instance buffer + lock.unlock(); + r = ReadIntoOneShotBuffer(offset, first_page_start, + actual_bytes_toread, left, dest); + } else { + buffer_.AllocateNewBuffer(actual_bytes_toread); + r = ReadIntoInstanceBuffer(offset, first_page_start, + actual_bytes_toread, left, dest); + } + } else { + buffer_.Clear(); + r = ReadIntoInstanceBuffer(offset, first_page_start, + actual_bytes_toread, left, dest); + } + } + } + } else { + r = PositionedReadInternal(scratch, left, offset); + if (r > 0) { + left -= r; + } + } + + *result = Slice(scratch, (r < 0) ? 0 : n - left); + + if (r < 0) { + s = IOErrorFromLastWindowsError(filename_); + } + return s; +} + +bool WinRandomAccessFile::ShouldForwardRawRequest() const { + return true; +} + +void WinRandomAccessFile::Hint(AccessPattern pattern) { + if (pattern == SEQUENTIAL && !use_os_buffer_ && + compaction_readahead_size_ > 0) { + std::lock_guard lg(buffer_mut_); + if (!read_ahead_) { + read_ahead_ = true; + // This would allocate read-ahead size + 2 alignments + // - one for memory alignment which added implicitly by AlignedBuffer + // - We add one more alignment because we will read one alignment more + // from disk + buffer_.AllocateNewBuffer(compaction_readahead_size_ + + buffer_.Alignment()); + } + } +} + +Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +Status WinWritableFile::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(filename_, hFile_, spaceToReserve); +} + +WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, + size_t capacity, const EnvOptions& options) + : filename_(fname), + hFile_(hFile), + use_os_buffer_(options.use_os_buffer), + alignment_(alignment), + filesize_(0), + reservedsize_(0) { + 
assert(!options.use_mmap_writes); +} + +WinWritableFile::~WinWritableFile() { + if (NULL != hFile_ && INVALID_HANDLE_VALUE != hFile_) { + WinWritableFile::Close(); + } +} + + // Indicates if the class makes use of unbuffered I/O +bool WinWritableFile::UseOSBuffer() const { + return use_os_buffer_; +} + +size_t WinWritableFile::GetRequiredBufferAlignment() const { + return alignment_; +} + +Status WinWritableFile::Append(const Slice& data) { + + // Used for buffered access ONLY + assert(use_os_buffer_); + assert(data.size() < std::numeric_limits::max()); + + Status s; + + DWORD bytesWritten = 0; + if (!WriteFile(hFile_, data.data(), + static_cast(data.size()), &bytesWritten, NULL)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to WriteFile: " + filename_, + lastError); + } else { + assert(size_t(bytesWritten) == data.size()); + filesize_ += data.size(); + } + + return s; +} + +Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { + Status s; + + SSIZE_T ret = pwrite(hFile_, data.data(), data.size(), offset); + + // Error break + if (ret < 0) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to pwrite for: " + filename_, lastError); + } else { + // With positional write it is not clear at all + // if this actually extends the filesize + assert(size_t(ret) == data.size()); + filesize_ += data.size(); + } + return s; +} + + // Need to implement this so the file is truncated correctly + // when buffered and unbuffered mode +Status WinWritableFile::Truncate(uint64_t size) { + Status s = ftruncate(filename_, hFile_, size); + if (s.ok()) { + filesize_ = size; + } + return s; +} + +Status WinWritableFile::Close() { + + Status s; + + assert(INVALID_HANDLE_VALUE != hFile_); + + if (fsync(hFile_) < 0) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_, + lastError); + } + + if (FALSE == ::CloseHandle(hFile_)) { + auto lastError = 
GetLastError(); + s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_, + lastError); + } + + hFile_ = INVALID_HANDLE_VALUE; + return s; +} + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter +Status WinWritableFile::Flush() { + return Status::OK(); +} + +Status WinWritableFile::Sync() { + Status s; + // Calls flush buffers + if (fsync(hFile_) < 0) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_, + lastError); + } + return s; +} + +Status WinWritableFile::Fsync() { return Sync(); } + +uint64_t WinWritableFile::GetFileSize() { + // Double accounting now here with WritableFileWriter + // and this size will be wrong when unbuffered access is used + // but tests implement their own writable files and do not use WritableFileWrapper + // so we need to squeeze a square peg through + // a round hole here. + return filesize_; +} + +Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) { + Status status; + TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = Roundup(offset + len, alignment_); + // Nothing to do + if (spaceToReserve <= reservedsize_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reservedsize_ = spaceToReserve; + } + return status; +} + +size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +Status WinDirectory::Fsync() { return Status::OK(); } + +WinFileLock::~WinFileLock() { + BOOL ret = ::CloseHandle(hFile_); + assert(ret); +} + + +} +} + diff --git a/external/rocksdb/port/win/io_win.h b/external/rocksdb/port/win/io_win.h new file mode 100644 index 
0000000000..8c3a4ba7a6 --- /dev/null +++ b/external/rocksdb/port/win/io_win.h @@ -0,0 +1,359 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include + +#include "util/aligned_buffer.h" + +#include +#include + +#include + +#include + +namespace rocksdb { +namespace port { + +std::string GetWindowsErrSz(DWORD err); + +inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { + return Status::IOError(context, GetWindowsErrSz(err)); +} + +inline Status IOErrorFromLastWindowsError(const std::string& context) { + return IOErrorFromWindowsError(context, GetLastError()); +} + +inline Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +// Note the below two do not set errno because they are used only here in this +// file +// on a Windows handle and, therefore, not necessary. 
Translating GetLastError() +// to errno +// is a sad business +inline int fsync(HANDLE hFile) { + if (!FlushFileBuffers(hFile)) { + return -1; + } + + return 0; +} + +SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, + uint64_t offset); + +SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset); + +Status fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size); + +Status ftruncate(const std::string& filename, HANDLE hFile, + uint64_t toSize); + + +size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); + +// mmap() based random-access +class WinMmapReadableFile : public RandomAccessFile { + const std::string fileName_; + HANDLE hFile_; + HANDLE hMap_; + + const void* mapped_region_; + const size_t length_; + +public: + // mapped_region_[0,length-1] contains the mmapped contents of the file. + WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, + const void* mapped_region, size_t length); + + ~WinMmapReadableFile(); + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +// We preallocate and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class WinMmapFile : public WritableFile { +private: + const std::string filename_; + HANDLE hFile_; + HANDLE hMap_; + + const size_t page_size_; // We flush the mapping view in page_size + // increments. 
We may decide if this is a memory + // page size or SSD page size + const size_t + allocation_granularity_; // View must start at such a granularity + + size_t reserved_size_; // Preallocated size + + size_t mapping_size_; // The max size of the mapping object + // we want to guess the final file size to minimize the remapping + size_t view_size_; // How much memory to map into a view at a time + + char* mapped_begin_; // Must begin at the file offset that is aligned with + // allocation_granularity_ + char* mapped_end_; + char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_]) + char* last_sync_; // Where have we synced up to + + uint64_t file_offset_; // Offset of mapped_begin_ in file + + // Do we have unsynced writes? + bool pending_sync_; + + // Can only truncate or reserve to a sector size aligned if + // used on files that are opened with Unbuffered I/O + Status TruncateFile(uint64_t toSize); + + Status UnmapCurrentRegion(); + + Status MapNewRegion(); + + virtual Status PreallocateInternal(uint64_t spaceToReserve); + +public: + + WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, + size_t allocation_granularity, const EnvOptions& options); + + ~WinMmapFile(); + + virtual Status Append(const Slice& data) override; + + // Means Close() will properly take care of truncate + // and it does not need any additional information + virtual Status Truncate(uint64_t size) override; + + virtual Status Close() override; + + virtual Status Flush() override; + + // Flush only data + virtual Status Sync() override; + + /** + * Flush data as well as metadata to stable storage. + */ + virtual Status Fsync() override; + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. 
+ */ + virtual uint64_t GetFileSize() override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual Status Allocate(uint64_t offset, uint64_t len) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinSequentialFile : public SequentialFile { +private: + const std::string filename_; + HANDLE file_; + + // There is no equivalent of advising away buffered pages as in posix. + // To implement this flag we would need to do unbuffered reads which + // will need to be aligned (not sure there is a guarantee that the buffer + // passed in is aligned). + // Hence we currently ignore this flag. It is used only in a few cases + // which should not be perf critical. + // If perf evaluation finds this to be a problem, we can look into + // implementing this. + bool use_os_buffer_; + +public: + WinSequentialFile(const std::string& fname, HANDLE f, + const EnvOptions& options); + + ~WinSequentialFile(); + + virtual Status Read(size_t n, Slice* result, char* scratch) override; + + virtual Status Skip(uint64_t n) override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +// pread() based random-access +class WinRandomAccessFile : public RandomAccessFile { + const std::string filename_; + HANDLE hFile_; + const bool use_os_buffer_; + bool read_ahead_; + const size_t compaction_readahead_size_; + const size_t random_access_max_buffer_size_; + mutable std::mutex buffer_mut_; + mutable AlignedBuffer buffer_; + mutable uint64_t + buffered_start_; // file offset set that is currently buffered + + /* + * The function reads a requested amount of bytes into the specified aligned + * buffer Upon success the function sets the length of the buffer to the + * amount of bytes actually read even though it might be less than actually + * requested. 
It then copies the amount of bytes requested by the user (left) + * to the user supplied buffer (dest) and reduces left by the amount of bytes + * copied to the user buffer + * + * @user_offset [in] - offset on disk where the read was requested by the user + * @first_page_start [in] - actual page aligned disk offset that we want to + * read from + * @bytes_to_read [in] - total amount of bytes that will be read from disk + * which is generally greater or equal to the amount + * that the user has requested due to the + * either alignment requirements or read_ahead in + * effect. + * @left [in/out] total amount of bytes that needs to be copied to the user + * buffer. It is reduced by the amount of bytes that actually + * copied + * @buffer - buffer to use + * @dest - user supplied buffer + */ + SSIZE_T ReadIntoBuffer(uint64_t user_offset, uint64_t first_page_start, + size_t bytes_to_read, size_t& left, + AlignedBuffer& buffer, char* dest) const; + + SSIZE_T ReadIntoOneShotBuffer(uint64_t user_offset, uint64_t first_page_start, + size_t bytes_to_read, size_t& left, + char* dest) const; + + SSIZE_T ReadIntoInstanceBuffer(uint64_t user_offset, + uint64_t first_page_start, + size_t bytes_to_read, size_t& left, + char* dest) const; + + void CalculateReadParameters(uint64_t offset, size_t bytes_requested, + size_t& actual_bytes_toread, + uint64_t& first_page_start) const; + + // Override for behavior change + virtual SSIZE_T PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset) const; + +public: + WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, + const EnvOptions& options); + + ~WinRandomAccessFile(); + + virtual void EnableReadAhead() override; + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual bool ShouldForwardRawRequest() const override; + + virtual void Hint(AccessPattern pattern) override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; 
+ + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + + +// This is a sequential write class. It has been mimicked (as others) after +// the original Posix class. We add support for unbuffered I/O on windows as +// well +// we utilize the original buffer as an alignment buffer to write directly to +// file with no buffering. +// No buffering requires that the provided buffer is aligned to the physical +// sector size (SSD page size) and +// that all SetFilePointer() operations to occur with such an alignment. +// We thus always write in sector/page size increments to the drive and leave +// the tail for the next write OR for Close() at which point we pad with zeros. +// No padding is required for +// buffered access. +class WinWritableFile : public WritableFile { +private: + const std::string filename_; + HANDLE hFile_; + const bool use_os_buffer_; // Used to indicate unbuffered access, the file + const uint64_t alignment_; + // must be opened as unbuffered if false + uint64_t filesize_; // How much data is actually written disk + uint64_t reservedsize_; // how far we have reserved space + + virtual Status PreallocateInternal(uint64_t spaceToReserve); + +public: + WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, + size_t capacity, const EnvOptions& options); + + ~WinWritableFile(); + + // Indicates if the class makes use of unbuffered I/O + virtual bool UseOSBuffer() const override; + + virtual size_t GetRequiredBufferAlignment() const override; + + virtual Status Append(const Slice& data) override; + + virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; + + // Need to implement this so the file is truncated correctly + // when buffered and unbuffered mode + virtual Status Truncate(uint64_t size) override; + + virtual Status Close() override; + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter + virtual Status Flush() override; + + virtual 
Status Sync() override; + + virtual Status Fsync() override; + + virtual uint64_t GetFileSize() override; + + virtual Status Allocate(uint64_t offset, uint64_t len) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinDirectory : public Directory { +public: + WinDirectory() {} + + virtual Status Fsync() override; +}; + +class WinFileLock : public FileLock { +public: + explicit WinFileLock(HANDLE hFile) : hFile_(hFile) { + assert(hFile != NULL); + assert(hFile != INVALID_HANDLE_VALUE); + } + + ~WinFileLock(); + +private: + HANDLE hFile_; +}; + +} +} diff --git a/external/rocksdb/port/win/port_win.cc b/external/rocksdb/port/win/port_win.cc index 2aaeada924..dd87c35770 100644 --- a/external/rocksdb/port/win/port_win.cc +++ b/external/rocksdb/port/win/port_win.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -39,69 +39,55 @@ void gettimeofday(struct timeval* tv, struct timezone* /* tz */) { seconds secNow(duration_cast(usNow)); - tv->tv_sec = secNow.count(); - tv->tv_usec = usNow.count() - duration_cast(secNow).count(); + tv->tv_sec = static_cast(secNow.count()); + tv->tv_usec = static_cast(usNow.count() - + duration_cast(secNow).count()); } -Mutex::Mutex(bool adaptive) : lock(m_mutex, std::defer_lock) {} - Mutex::~Mutex() {} -void Mutex::Lock() { - lock.lock(); -#ifndef NDEBUG - locked_ = true; -#endif -} - -void Mutex::Unlock() { -#ifndef NDEBUG - locked_ = false; -#endif - lock.unlock(); -} - -void Mutex::AssertHeld() { -#ifndef NDEBUG - assert(locked_); -#endif -} - -CondVar::CondVar(Mutex* mu) : mu_(mu) {} - CondVar::~CondVar() {} void CondVar::Wait() { + // Caller must ensure that mutex is held prior to calling this method + std::unique_lock lk(mu_->getLock(), std::adopt_lock); #ifndef NDEBUG mu_->locked_ = false; #endif - cv_.wait(mu_->getLock()); + cv_.wait(lk); #ifndef NDEBUG mu_->locked_ = true; #endif + // Release ownership of the lock as we don't want it to be unlocked when + // it goes out of scope (as we adopted the lock and didn't lock it ourselves) + lk.release(); } bool CondVar::TimedWait(uint64_t abs_time_us) { -#ifndef NDEBUG - mu_->locked_ = false; -#endif using namespace std::chrono; // MSVC++ library implements wait_until in terms of wait_for so - // there is not an absolute wait anyway. + // we need to convert absolute wait into relative wait. microseconds usAbsTime(abs_time_us); microseconds usNow( - duration_cast(system_clock::now().time_since_epoch())); + duration_cast(system_clock::now().time_since_epoch())); microseconds relTimeUs = - (usAbsTime > usNow) ? (usAbsTime - usNow) : microseconds::zero(); - - std::cv_status cvStatus = cv_.wait_for(mu_->getLock(), relTimeUs); + (usAbsTime > usNow) ? 
(usAbsTime - usNow) : microseconds::zero(); + // Caller must ensure that mutex is held prior to calling this method + std::unique_lock lk(mu_->getLock(), std::adopt_lock); +#ifndef NDEBUG + mu_->locked_ = false; +#endif + std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs); #ifndef NDEBUG mu_->locked_ = true; #endif + // Release ownership of the lock as we don't want it to be unlocked when + // it goes out of scope (as we adopted the lock and didn't lock it ourselves) + lk.release(); if (cvStatus == std::cv_status::timeout) { return true; @@ -114,8 +100,10 @@ void CondVar::Signal() { cv_.notify_one(); } void CondVar::SignalAll() { cv_.notify_all(); } +int PhysicalCoreID() { return GetCurrentProcessorNumber(); } + void InitOnce(OnceType* once, void (*initializer)()) { - std::call_once(*once, initializer); + std::call_once(once->flag_, initializer); } // Private structure, exposed only by pointer @@ -245,6 +233,8 @@ int GetMaxOpenFiles() { return -1; } #include "jemalloc/jemalloc.h" +#ifndef JEMALLOC_NON_INIT + namespace rocksdb { namespace port { @@ -290,6 +280,8 @@ JEMALLOC_SECTION(".CRT$XCT") JEMALLOC_ATTR(used) static const void( } // extern "C" +#endif // JEMALLOC_NON_INIT + // Global operators to be replaced by a linker void* operator new(size_t size) { diff --git a/external/rocksdb/port/win/port_win.h b/external/rocksdb/port/win/port_win.h index 1f517fb784..54f10a24c9 100644 --- a/external/rocksdb/port/win/port_win.h +++ b/external/rocksdb/port/win/port_win.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -41,11 +42,6 @@ #define strcasecmp _stricmp #endif -// defined in stdio.h -#ifndef snprintf -#define snprintf _snprintf -#endif - #undef GetCurrentTime #undef DeleteFile @@ -58,27 +54,8 @@ typedef SSIZE_T ssize_t; #define ROCKSDB_PRIszt "Iu" #endif -#define ROCKSDB_NOEXCEPT - #define __attribute__(A) -#ifdef ZLIB -#include -#endif - -#ifdef BZIP2 -#include -#endif - -#if defined(LZ4) -#include -#include -#endif - -#ifdef SNAPPY -#include -#endif - // Thread local storage on Linux // There is thread_local in C++11 #ifndef __thread @@ -92,50 +69,96 @@ typedef SSIZE_T ssize_t; namespace rocksdb { #define PREFETCH(addr, rw, locality) -std::string GetWindowsErrSz(DWORD err); namespace port { +// VS 15 +#if (defined _MSC_VER) && (_MSC_VER >= 1900) + +#define ROCKSDB_NOEXCEPT noexcept + // For use at db/file_indexer.h kLevelMaxIndex -const int kMaxInt32 = INT32_MAX; -const uint64_t kMaxUint64 = UINT64_MAX; +const int kMaxInt32 = std::numeric_limits::max(); +const uint64_t kMaxUint64 = std::numeric_limits::max(); +const int64_t kMaxInt64 = std::numeric_limits::max(); + +const size_t kMaxSizet = std::numeric_limits::max(); + +#else //_MSC_VER + +// VS 15 has snprintf +#define snprintf _snprintf + +#define ROCKSDB_NOEXCEPT // std::numeric_limits::max() is not constexpr just yet // therefore, use the same limits + +// For use at db/file_indexer.h kLevelMaxIndex +const int kMaxInt32 = INT32_MAX; +const int64_t kMaxInt64 = INT64_MAX; +const uint64_t kMaxUint64 = UINT64_MAX; + #ifdef _WIN64 const size_t kMaxSizet = UINT64_MAX; #else const size_t kMaxSizet = UINT_MAX; #endif +#endif //_MSC_VER + const bool kLittleEndian = true; class CondVar; class Mutex { public: - /* implicit */ Mutex(bool adaptive = false); + + /* implicit */ Mutex(bool adaptive = false) +#ifndef NDEBUG + : locked_(false) +#endif + { } + ~Mutex(); - void Lock(); - void Unlock(); + void Lock() { + mutex_.lock(); +#ifndef NDEBUG + 
locked_ = true; +#endif + } + + void Unlock() { +#ifndef NDEBUG + locked_ = false; +#endif + mutex_.unlock(); + } // this will assert if the mutex is not locked // it does NOT verify that mutex is held by a calling thread - void AssertHeld(); + void AssertHeld() { +#ifndef NDEBUG + assert(locked_); +#endif + } - std::unique_lock& getLock() { return lock; } + // Mutex is move only with lock ownership transfer + Mutex(const Mutex&) = delete; + void operator=(const Mutex&) = delete; private: + friend class CondVar; - std::mutex m_mutex; - std::unique_lock lock; + + std::mutex& getLock() { + return mutex_; + } + + std::mutex mutex_; #ifndef NDEBUG bool locked_; #endif - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); }; class RWMutex { @@ -162,30 +185,56 @@ class RWMutex { class CondVar { public: - explicit CondVar(Mutex* mu); + explicit CondVar(Mutex* mu) : mu_(mu) { + } + ~CondVar(); void Wait(); bool TimedWait(uint64_t expiration_time); void Signal(); void SignalAll(); + // Condition var is not copy/move constructible + CondVar(const CondVar&) = delete; + CondVar& operator=(const CondVar&) = delete; + + CondVar(CondVar&&) = delete; + CondVar& operator=(CondVar&&) = delete; + private: std::condition_variable cv_; Mutex* mu_; }; -typedef std::once_flag OnceType; -#define LEVELDB_ONCE_INIT std::once_flag::once_flag(); + +// OnceInit type helps emulate +// Posix semantics with initialization +// adopted in the project +struct OnceType { + + struct Init {}; + + OnceType() {} + OnceType(const Init&) {} + OnceType(const OnceType&) = delete; + OnceType& operator=(const OnceType&) = delete; + + std::once_flag flag_; +}; + +#define LEVELDB_ONCE_INIT port::OnceType::Init() extern void InitOnce(OnceType* once, void (*initializer)()); #define CACHE_LINE_SIZE 64U -#ifdef min -#undef min -#endif -#ifdef max -#undef max +static inline void AsmVolatilePause() { +#if defined(_M_IX86) || defined(_M_X64) + YieldProcessor(); #endif + // it would be nice to get 
"wfe" on ARM here +} + +extern int PhysicalCoreID(); // For Thread Local Storage abstraction typedef DWORD pthread_key_t; @@ -247,4 +296,4 @@ using port::truncate; } // namespace rocksdb -#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ +#endif // STORAGE_LEVELDB_PORT_PORT_WIN_H_ diff --git a/external/rocksdb/port/win/win_logger.cc b/external/rocksdb/port/win/win_logger.cc index e91930dffa..3c4ae1f88c 100644 --- a/external/rocksdb/port/win/win_logger.cc +++ b/external/rocksdb/port/win/win_logger.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,7 +10,9 @@ // Logger implementation that can be shared by all environments // where enough posix functionality is available. -#include +#include "port/win/win_logger.h" +#include "port/win/io_win.h" + #include #include #include @@ -19,14 +21,13 @@ #include "rocksdb/env.h" -#include - -#include "port/win/win_logger.h" #include "port/sys_time.h" #include "util/iostats_context_imp.h" namespace rocksdb { +namespace port { + WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, const InfoLogLevel log_level) : Logger(log_level), @@ -53,8 +54,9 @@ void WinLogger::close() { CloseHandle(file_); } void WinLogger::Flush() { if (flush_pending_) { flush_pending_ = false; - // With Windows API writes go to OS buffers directly so no fflush needed unlike - // with C runtime API. We don't flush all the way to disk for perf reasons. + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. 
} last_flush_micros_ = env_->NowMicros(); @@ -124,15 +126,16 @@ void WinLogger::Logv(const char* format, va_list ap) { assert(p <= limit); const size_t write_size = p - base; - DWORD bytesWritten = 0; - BOOL ret = WriteFile(file_, base, write_size, &bytesWritten, NULL); + DWORD bytesWritten = 0; + BOOL ret = WriteFile(file_, base, static_cast(write_size), + &bytesWritten, NULL); if (ret == FALSE) { std::string errSz = GetWindowsErrSz(GetLastError()); fprintf(stderr, errSz.c_str()); } flush_pending_ = true; - assert(bytesWritten == write_size); + assert((bytesWritten == write_size) || (ret == FALSE)); if (bytesWritten > 0) { log_size_ += write_size; } @@ -141,8 +144,9 @@ void WinLogger::Logv(const char* format, va_list ap) { static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { flush_pending_ = false; - // With Windows API writes go to OS buffers directly so no fflush needed unlike - // with C runtime API. We don't flush all the way to disk for perf reasons. + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. last_flush_micros_ = now_micros; } break; @@ -151,4 +155,6 @@ void WinLogger::Logv(const char* format, va_list ap) { size_t WinLogger::GetLogFileSize() const { return log_size_; } +} + } // namespace rocksdb diff --git a/external/rocksdb/port/win/win_logger.h b/external/rocksdb/port/win/win_logger.h index 67e45907f9..84971363df 100644 --- a/external/rocksdb/port/win/win_logger.h +++ b/external/rocksdb/port/win/win_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,11 +16,14 @@ #include "rocksdb/env.h" +#include +#include + namespace rocksdb { class Env; -const int kDebugLogChunkSize = 128 * 1024; +namespace port { class WinLogger : public rocksdb::Logger { public: @@ -54,4 +57,6 @@ class WinLogger : public rocksdb::Logger { const static uint64_t flush_every_seconds_ = 5; }; +} + } // namespace rocksdb diff --git a/external/rocksdb/port/win/xpress_win.cc b/external/rocksdb/port/win/xpress_win.cc new file mode 100644 index 0000000000..a0206b5780 --- /dev/null +++ b/external/rocksdb/port/win/xpress_win.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/win/xpress_win.h" +#include + +#include +#include +#include +#include + +#ifdef XPRESS + +#ifdef JEMALLOC +#include +#endif + +// Put this under ifdef so windows systems w/o this +// can still build +#include + +namespace rocksdb { +namespace port { +namespace xpress { + +// Helpers +namespace { + +auto CloseCompressorFun = [](void* h) { + if (NULL != h) { + ::CloseCompressor(reinterpret_cast(h)); + } +}; + +auto CloseDecompressorFun = [](void* h) { + if (NULL != h) { + ::CloseDecompressor(reinterpret_cast(h)); + } +}; + + +#ifdef JEMALLOC +// Make sure compressors use our jemalloc if redirected +PVOID CompressorAlloc(PVOID, SIZE_T size) { + return je_malloc(size); +} + +VOID CompressorFree(PVOID, PVOID p) { + if (p != NULL) { + je_free(p); + } +} + +#endif + +} + +bool Compress(const char* input, size_t length, std::string* output) { + + assert(input != nullptr); + assert(output != nullptr); + + if (length == 0) { + output->clear(); + return true; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + +#ifdef JEMALLOC + COMPRESS_ALLOCATION_ROUTINES allocationRoutines; + + // Init. allocation routines + allocationRoutines.Allocate = CompressorAlloc; + allocationRoutines.Free = CompressorFree; + allocationRoutines.UserContext = NULL; + + allocRoutinesPtr = &allocationRoutines; +#endif + + COMPRESSOR_HANDLE compressor = NULL; + + BOOL success = CreateCompressor( + COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &compressor); // Handle + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Compressor LastError: " << + GetLastError() << std::endl; +#endif + return false; + } + + std::unique_ptr + compressorGuard(compressor, CloseCompressorFun); + + SIZE_T compressedBufferSize = 0; + + // Query compressed buffer size. 
+ success = ::Compress( + compressor, // Compressor Handle + const_cast(input), // Input buffer + length, // Uncompressed data size + NULL, // Compressed Buffer + 0, // Compressed Buffer size + &compressedBufferSize); // Compressed Data size + + if (!success) { + + auto lastError = GetLastError(); + + if (lastError != ERROR_INSUFFICIENT_BUFFER) { +#ifdef _DEBUG + std::cerr << + "XPRESS: Failed to estimate compressed buffer size LastError " << + lastError << std::endl; +#endif + return false; + } + } + + assert(compressedBufferSize > 0); + + std::string result; + result.resize(compressedBufferSize); + + SIZE_T compressedDataSize = 0; + + // Compress + success = ::Compress( + compressor, // Compressor Handle + const_cast(input), // Input buffer + length, // Uncompressed data size + &result[0], // Compressed Buffer + compressedBufferSize, // Compressed Buffer size + &compressedDataSize); // Compressed Data size + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to compress LastError " << + GetLastError() << std::endl; +#endif + return false; + } + + result.resize(compressedDataSize); + output->swap(result); + + return true; +} + +char* Decompress(const char* input_data, size_t input_length, + int* decompress_size) { + + assert(input_data != nullptr); + assert(decompress_size != nullptr); + + if (input_length == 0) { + return nullptr; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + +#ifdef JEMALLOC + COMPRESS_ALLOCATION_ROUTINES allocationRoutines; + + // Init. 
allocation routines + allocationRoutines.Allocate = CompressorAlloc; + allocationRoutines.Free = CompressorFree; + allocationRoutines.UserContext = NULL; + allocRoutinesPtr = &allocationRoutines; +#endif + + DECOMPRESSOR_HANDLE decompressor = NULL; + + BOOL success = CreateDecompressor( + COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &decompressor); // Handle + + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Decompressor LastError " + << GetLastError() << std::endl; +#endif + return nullptr; + } + + std::unique_ptr + compressorGuard(decompressor, CloseDecompressorFun); + + SIZE_T decompressedBufferSize = 0; + + success = ::Decompress( + decompressor, // Compressor Handle + const_cast(input_data), // Compressed data + input_length, // Compressed data size + NULL, // Buffer set to NULL + 0, // Buffer size set to 0 + &decompressedBufferSize); // Decompressed Data size + + if (!success) { + + auto lastError = GetLastError(); + + if (lastError != ERROR_INSUFFICIENT_BUFFER) { +#ifdef _DEBUG + std::cerr + << "XPRESS: Failed to estimate decompressed buffer size LastError " + << lastError << std::endl; +#endif + return nullptr; + } + } + + assert(decompressedBufferSize > 0); + + // On Windows we are limited to a 32-bit int for the + // output data size argument + // so we hopefully never get here + if (decompressedBufferSize > std::numeric_limits::max()) { + assert(false); + return nullptr; + } + + // The callers are deallocating using delete[] + // thus we must allocate with new[] + std::unique_ptr outputBuffer(new char[decompressedBufferSize]); + + SIZE_T decompressedDataSize = 0; + + success = ::Decompress( + decompressor, + const_cast(input_data), + input_length, + outputBuffer.get(), + decompressedBufferSize, + &decompressedDataSize); + + if (!success) { +#ifdef _DEBUG + std::cerr << + "XPRESS: Failed to decompress LastError " << + GetLastError() << std::endl; +#endif + return 
nullptr; + } + + *decompress_size = static_cast(decompressedDataSize); + + // Return the raw buffer to the caller supporting the tradition + return outputBuffer.release(); +} +} +} +} + +#endif diff --git a/external/rocksdb/port/win/xpress_win.h b/external/rocksdb/port/win/xpress_win.h new file mode 100644 index 0000000000..7d1cbb68b3 --- /dev/null +++ b/external/rocksdb/port/win/xpress_win.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +namespace rocksdb { +namespace port { +namespace xpress { + +bool Compress(const char* input, size_t length, std::string* output); + +char* Decompress(const char* input_data, size_t input_length, + int* decompress_size); + +} +} +} + diff --git a/external/rocksdb/port/xpress.h b/external/rocksdb/port/xpress.h new file mode 100644 index 0000000000..db023a2d9e --- /dev/null +++ b/external/rocksdb/port/xpress.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +// Xpress on Windows is implemeted using Win API +#if defined(ROCKSDB_PLATFORM_POSIX) +#error "Xpress compression not implemented" +#elif defined(OS_WIN) +#include "port/win/xpress_win.h" +#endif diff --git a/external/rocksdb/src.mk b/external/rocksdb/src.mk index 5e9ed16447..e647e2b3cb 100644 --- a/external/rocksdb/src.mk +++ b/external/rocksdb/src.mk @@ -1,5 +1,6 @@ # These are the sources from which librocksdb.a is built: LIB_SOURCES = \ + db/auto_roll_logger.cc \ db/builder.cc \ db/c.cc \ db/column_family.cc \ @@ -15,6 +16,8 @@ LIB_SOURCES = \ db/db_impl_debug.cc \ db/db_impl_readonly.cc \ db/db_impl_experimental.cc \ + db/db_impl_add_file.cc \ + db/db_info_dumper.cc \ db/db_iter.cc \ db/experimental.cc \ db/event_helpers.cc \ @@ -33,7 +36,6 @@ LIB_SOURCES = \ db/merge_helper.cc \ db/merge_operator.cc \ db/repair.cc \ - db/slice.cc \ db/snapshot_impl.cc \ db/table_cache.cc \ db/table_properties_collector.cc \ @@ -46,6 +48,12 @@ LIB_SOURCES = \ db/write_batch_base.cc \ db/write_controller.cc \ db/write_thread.cc \ + db/xfunc_test_points.cc \ + memtable/hash_cuckoo_rep.cc \ + memtable/hash_linklist_rep.cc \ + memtable/hash_skiplist_rep.cc \ + memtable/skiplistrep.cc \ + memtable/vectorrep.cc \ port/stack_trace.cc \ port/port_posix.cc \ table/adaptive_table_factory.cc \ @@ -55,7 +63,6 @@ LIB_SOURCES = \ table/block_based_table_reader.cc \ table/block_builder.cc \ table/block.cc \ - table/block_hash_index.cc \ table/block_prefix_index.cc \ table/bloom_block.cc \ table/cuckoo_table_builder.cc \ @@ -74,34 +81,38 @@ LIB_SOURCES = \ table/plain_table_index.cc \ table/plain_table_key_coding.cc \ table/plain_table_reader.cc \ + table/persistent_cache_helper.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ tools/dump/db_dump_tool.cc \ util/arena.cc \ - util/auto_roll_logger.cc \ util/bloom.cc \ util/build_version.cc \ - util/cache.cc \ util/coding.cc \ util/comparator.cc \ util/compaction_job_stats_impl.cc \ + util/concurrent_arena.cc \ 
util/crc32c.cc \ - util/db_info_dumper.cc \ - util/delete_scheduler_impl.cc \ + util/delete_scheduler.cc \ util/dynamic_bloom.cc \ util/env.cc \ + util/env_chroot.cc \ util/env_hdfs.cc \ util/env_posix.cc \ util/file_util.cc \ util/file_reader_writer.cc \ util/filter_policy.cc \ util/hash.cc \ - util/hash_cuckoo_rep.cc \ - util/hash_linklist_rep.cc \ - util/hash_skiplist_rep.cc \ util/histogram.cc \ + util/histogram_windowing.cc \ util/instrumented_mutex.cc \ util/iostats_context.cc \ + util/io_posix.cc \ + util/lru_cache.cc \ + util/threadpool.cc \ + util/transaction_test_util.cc \ + util/sharded_cache.cc \ + util/sst_file_manager_impl.cc \ utilities/backupable/backupable_db.cc \ utilities/convenience/info_log_finder.cc \ utilities/checkpoint/checkpoint.cc \ @@ -109,14 +120,25 @@ LIB_SOURCES = \ utilities/document/document_db.cc \ utilities/document/json_document_builder.cc \ utilities/document/json_document.cc \ + utilities/env_mirror.cc \ + utilities/env_registry.cc \ utilities/flashcache/flashcache.cc \ utilities/geodb/geodb_impl.cc \ utilities/leveldb_options/leveldb_options.cc \ + utilities/memory/memory_util.cc \ utilities/merge_operators/put.cc \ + utilities/merge_operators/max.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/uint64add.cc \ + utilities/option_change_migration/option_change_migration.cc \ + utilities/options/options_util.cc \ + utilities/persistent_cache/persistent_cache_tier.cc \ + utilities/persistent_cache/volatile_tier_impl.cc \ + utilities/persistent_cache/block_cache_tier_file.cc \ + utilities/persistent_cache/block_cache_tier_metadata.cc \ utilities/redis/redis_lists.cc \ + utilities/simulator_cache/sim_cache.cc \ utilities/spatialdb/spatial_db.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/transactions/optimistic_transaction_impl.cc \ @@ -136,14 +158,14 @@ LIB_SOURCES = \ util/memenv.cc \ 
util/murmurhash.cc \ util/mutable_cf_options.cc \ - util/options_builder.cc \ util/options.cc \ util/options_helper.cc \ util/options_parser.cc \ + util/options_sanity_check.cc \ util/perf_context.cc \ util/perf_level.cc \ + util/random.cc \ util/rate_limiter.cc \ - util/skiplistrep.cc \ util/slice.cc \ util/statistics.cc \ util/status.cc \ @@ -156,21 +178,30 @@ LIB_SOURCES = \ util/thread_status_updater_debug.cc \ util/thread_status_util.cc \ util/thread_status_util_debug.cc \ - util/vectorrep.cc \ util/xfunc.cc \ util/xxhash.cc \ -TOOL_SOURCES = \ - util/ldb_cmd.cc \ - util/ldb_tool.cc \ - util/sst_dump_tool.cc \ +TOOL_LIB_SOURCES = \ + tools/ldb_cmd.cc \ + tools/ldb_tool.cc \ + tools/sst_dump_tool.cc \ -MOCK_SOURCES = \ +MOCK_LIB_SOURCES = \ table/mock_table.cc \ - util/mock_env.cc + util/mock_env.cc \ + util/fault_injection_test_env.cc -TEST_BENCH_SOURCES = \ +BENCH_LIB_SOURCES = \ + tools/db_bench_tool.cc + +TEST_LIB_SOURCES = \ + util/testharness.cc \ + util/testutil.cc \ + db/db_test_util.cc + +MAIN_SOURCES = \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + db/auto_roll_logger_test.cc \ db/column_family_test.cc \ db/compaction_job_test.cc \ db/compaction_job_stats_test.cc \ @@ -178,28 +209,37 @@ TEST_BENCH_SOURCES = \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ - db/db_bench.cc \ db/dbformat_test.cc \ db/db_iter_test.cc \ db/db_test.cc \ + db/db_block_cache_test.cc \ + db/db_io_failure_test.cc \ + db/db_bloom_filter_test.cc \ db/db_compaction_filter_test.cc \ db/db_compaction_test.cc \ db/db_dynamic_level_test.cc \ + db/db_flush_test.cc \ db/db_inplace_update_test.cc \ + db/db_iterator_test.cc \ db/db_log_iter_test.cc \ - db/db_universal_compaction_test.cc \ + db/db_options_test.cc \ + db/db_sst_test.cc \ db/db_tailing_iter_test.cc \ + db/db_universal_compaction_test.cc \ db/db_wal_test.cc \ + db/db_table_properties_test.cc \ db/deletefile_test.cc \ db/fault_injection_test.cc \ db/file_indexer_test.cc \ 
db/filename_test.cc \ db/flush_job_test.cc \ + db/inlineskiplist_test.cc \ db/listener_test.cc \ - db/log_and_apply_bench.cc \ db/log_test.cc \ + db/manual_compaction_test.cc \ db/memtablerep_bench.cc \ db/merge_test.cc \ + db/options_file_test.cc \ db/perf_context_test.cc \ db/plain_table_db_test.cc \ db/prefix_test.cc \ @@ -213,7 +253,6 @@ TEST_BENCH_SOURCES = \ db/write_controller_test.cc \ db/write_callback_test.cc \ table/block_based_filter_block_test.cc \ - table/block_hash_index_test.cc \ table/block_test.cc \ table/cuckoo_table_builder_test.cc \ table/cuckoo_table_reader_test.cc \ @@ -221,50 +260,52 @@ TEST_BENCH_SOURCES = \ table/merger_test.cc \ table/table_reader_bench.cc \ table/table_test.cc \ + tools/db_bench.cc \ + tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ + tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ + tools/sst_dump_test.cc \ util/arena_test.cc \ - util/auto_roll_logger_test.cc \ util/autovector_test.cc \ - util/benchharness.cc \ - util/benchharness_test.cc \ util/bloom_test.cc \ util/cache_bench.cc \ util/cache_test.cc \ util/coding_test.cc \ util/crc32c_test.cc \ - util/db_test_util.cc \ util/dynamic_bloom_test.cc \ + util/env_basic_test.cc \ util/env_test.cc \ util/filelock_test.cc \ util/histogram_test.cc \ + util/statistics_test.cc \ utilities/backupable/backupable_db_test.cc \ utilities/checkpoint/checkpoint_test.cc \ utilities/document/document_db_test.cc \ utilities/document/json_document_test.cc \ + utilities/env_registry_test.cc \ utilities/geodb/geodb_test.cc \ + utilities/memory/memory_test.cc \ utilities/merge_operators/string_append/stringappend_test.cc \ + utilities/option_change_migration/option_change_migration_test.cc \ + utilities/options/options_util_test.cc \ utilities/redis/redis_lists_test.cc \ + utilities/simulator_cache/sim_cache_test.cc \ utilities/spatialdb/spatial_db_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ 
utilities/transactions/optimistic_transaction_test.cc \ utilities/transactions/transaction_test.cc \ utilities/ttl/ttl_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \ + util/iostats_context_test.cc \ util/log_write_bench.cc \ - util/manual_compaction_test.cc \ - util/memenv_test.cc \ util/mock_env_test.cc \ util/options_test.cc \ util/event_logger_test.cc \ util/rate_limiter_test.cc \ util/slice_transform_test.cc \ - util/sst_dump_test.cc \ - util/testharness.cc \ - util/testutil.cc \ util/thread_list_test.cc \ - util/thread_local_test.cc \ - util/ldb_cmd_test.cc + util/thread_local_test.cc JNI_NATIVE_SOURCES = \ java/rocksjni/backupenginejni.cc \ diff --git a/external/rocksdb/table/adaptive_table_factory.cc b/external/rocksdb/table/adaptive_table_factory.cc index c589c07a97..bb19c417ed 100644 --- a/external/rocksdb/table/adaptive_table_factory.cc +++ b/external/rocksdb/table/adaptive_table_factory.cc @@ -43,7 +43,8 @@ extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const { + unique_ptr* table, + bool prefetch_index_and_filter_in_cache) const { Footer footer; auto s = ReadFooterFromFile(file.get(), file_size, &footer); if (!s.ok()) { @@ -66,9 +67,10 @@ Status AdaptiveTableFactory::NewTableReader( } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, WritableFileWriter* file) const { - return table_factory_to_write_->NewTableBuilder(table_builder_options, file); + return table_factory_to_write_->NewTableBuilder(table_builder_options, + column_family_id, file); } std::string AdaptiveTableFactory::GetPrintableTableOptions() const { @@ -77,27 +79,30 @@ std::string AdaptiveTableFactory::GetPrintableTableOptions() const { const int kBufferSize = 200; char 
buffer[kBufferSize]; - if (!table_factory_to_write_) { + if (table_factory_to_write_) { snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n", - table_factory_to_write_->Name(), + (table_factory_to_write_->Name() ? table_factory_to_write_->Name() + : ""), table_factory_to_write_->GetPrintableTableOptions().c_str()); ret.append(buffer); } - if (!plain_table_factory_) { + if (plain_table_factory_) { snprintf(buffer, kBufferSize, " %s options:\n%s\n", - plain_table_factory_->Name(), + plain_table_factory_->Name() ? plain_table_factory_->Name() : "", plain_table_factory_->GetPrintableTableOptions().c_str()); ret.append(buffer); } - if (!block_based_table_factory_) { - snprintf(buffer, kBufferSize, " %s options:\n%s\n", - block_based_table_factory_->Name(), - block_based_table_factory_->GetPrintableTableOptions().c_str()); + if (block_based_table_factory_) { + snprintf( + buffer, kBufferSize, " %s options:\n%s\n", + (block_based_table_factory_->Name() ? block_based_table_factory_->Name() + : ""), + block_based_table_factory_->GetPrintableTableOptions().c_str()); ret.append(buffer); } - if (!cuckoo_table_factory_) { + if (cuckoo_table_factory_) { snprintf(buffer, kBufferSize, " %s options:\n%s\n", - cuckoo_table_factory_->Name(), + cuckoo_table_factory_->Name() ? 
cuckoo_table_factory_->Name() : "", cuckoo_table_factory_->GetPrintableTableOptions().c_str()); ret.append(buffer); } diff --git a/external/rocksdb/table/adaptive_table_factory.h b/external/rocksdb/table/adaptive_table_factory.h index dfcae14cf4..b7b52ba96f 100644 --- a/external/rocksdb/table/adaptive_table_factory.h +++ b/external/rocksdb/table/adaptive_table_factory.h @@ -33,14 +33,15 @@ class AdaptiveTableFactory : public TableFactory { const char* Name() const override { return "AdaptiveTableFactory"; } - Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader( + const TableReaderOptions& table_reader_options, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table, + bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFileWriter* file) const override; + uint32_t column_family_id, WritableFileWriter* file) const override; // Sanitizes the specified DB Options. Status SanitizeOptions(const DBOptions& db_opts, diff --git a/external/rocksdb/table/block.cc b/external/rocksdb/table/block.cc index 99c76f695b..fa14a00f02 100644 --- a/external/rocksdb/table/block.cc +++ b/external/rocksdb/table/block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -18,7 +18,6 @@ #include "rocksdb/comparator.h" #include "table/format.h" -#include "table/block_hash_index.h" #include "table/block_prefix_index.h" #include "util/coding.h" #include "util/logging.h" @@ -64,6 +63,40 @@ void BlockIter::Next() { void BlockIter::Prev() { assert(Valid()); + assert(prev_entries_idx_ == -1 || + static_cast(prev_entries_idx_) < prev_entries_.size()); + // Check if we can use cached prev_entries_ + if (prev_entries_idx_ > 0 && + prev_entries_[prev_entries_idx_].offset == current_) { + // Read cached CachedPrevEntry + prev_entries_idx_--; + const CachedPrevEntry& current_prev_entry = + prev_entries_[prev_entries_idx_]; + + const char* key_ptr = nullptr; + if (current_prev_entry.key_ptr != nullptr) { + // The key is not delta encoded and stored in the data block + key_ptr = current_prev_entry.key_ptr; + key_pinned_ = true; + } else { + // The key is delta encoded and stored in prev_entries_keys_buff_ + key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; + key_pinned_ = false; + } + const Slice current_key(key_ptr, current_prev_entry.key_size); + + current_ = current_prev_entry.offset; + key_.SetKey(current_key, false /* copy */); + value_ = current_prev_entry.value; + + return; + } + + // Clear prev entries cache + prev_entries_idx_ = -1; + prev_entries_.clear(); + prev_entries_keys_buff_.clear(); + // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { @@ -77,9 +110,28 @@ void BlockIter::Prev() { } SeekToRestartPoint(restart_index_); + do { + if (!ParseNextKey()) { + break; + } + Slice current_key = key(); + + if (key_.IsKeyPinned()) { + // The key is not delta encoded + prev_entries_.emplace_back(current_, current_key.data(), 0, + current_key.size(), value()); + } else { + // The key is delta encoded, cache decoded key in buffer + size_t new_key_offset = prev_entries_keys_buff_.size(); + 
prev_entries_keys_buff_.append(current_key.data(), current_key.size()); + + prev_entries_.emplace_back(current_, nullptr, new_key_offset, + current_key.size(), value()); + } // Loop until end of current entry hits the start of original entry - } while (ParseNextKey() && NextEntryOffset() < original); + } while (NextEntryOffset() < original); + prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; } void BlockIter::Seek(const Slice& target) { @@ -92,8 +144,7 @@ void BlockIter::Seek(const Slice& target) { if (prefix_index_) { ok = PrefixSeek(target, &index); } else { - ok = hash_index_ ? HashSeek(target, &index) - : BinarySeek(target, 0, num_restarts_ - 1, &index); + ok = BinarySeek(target, 0, num_restarts_ - 1, &index); } if (!ok) { @@ -153,7 +204,16 @@ bool BlockIter::ParseNextKey() { CorruptionError(); return false; } else { - key_.TrimAppend(shared, p, non_shared); + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. + key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } value_ = Slice(p + non_shared, value_length); while (restart_index_ + 1 < num_restarts_ && GetRestartPoint(restart_index_ + 1) < current_) { @@ -266,21 +326,6 @@ bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, } } -bool BlockIter::HashSeek(const Slice& target, uint32_t* index) { - assert(hash_index_); - auto restart_index = hash_index_->GetRestartIndex(target); - if (restart_index == nullptr) { - current_ = restarts_; - return false; - } - - // the elements in restart_array[index : index + num_blocks] - // are all with same prefix. We'll do binary search in that small range. 
- auto left = restart_index->first_index; - auto right = restart_index->first_index + restart_index->num_blocks - 1; - return BinarySeek(target, left, right, index); -} - bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) { assert(prefix_index_); uint32_t* block_ids = nullptr; @@ -316,14 +361,14 @@ Block::Block(BlockContents&& contents) } } -Iterator* Block::NewIterator( - const Comparator* cmp, BlockIter* iter, bool total_order_seek) { +InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter, + bool total_order_seek) { if (size_ < 2*sizeof(uint32_t)) { if (iter != nullptr) { iter->SetStatus(Status::Corruption("bad block contents")); return iter; } else { - return NewErrorIterator(Status::Corruption("bad block contents")); + return NewErrorInternalIterator(Status::Corruption("bad block contents")); } } const uint32_t num_restarts = NumRestarts(); @@ -332,39 +377,30 @@ Iterator* Block::NewIterator( iter->SetStatus(Status::OK()); return iter; } else { - return NewEmptyIterator(); + return NewEmptyInternalIterator(); } } else { - BlockHashIndex* hash_index_ptr = - total_order_seek ? nullptr : hash_index_.get(); BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index_.get(); if (iter != nullptr) { iter->Initialize(cmp, data_, restart_offset_, num_restarts, - hash_index_ptr, prefix_index_ptr); + prefix_index_ptr); } else { iter = new BlockIter(cmp, data_, restart_offset_, num_restarts, - hash_index_ptr, prefix_index_ptr); + prefix_index_ptr); } } return iter; } -void Block::SetBlockHashIndex(BlockHashIndex* hash_index) { - hash_index_.reset(hash_index); -} - void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) { prefix_index_.reset(prefix_index); } size_t Block::ApproximateMemoryUsage() const { size_t usage = usable_size(); - if (hash_index_) { - usage += hash_index_->ApproximateMemoryUsage(); - } if (prefix_index_) { usage += prefix_index_->ApproximateMemoryUsage(); } diff --git a/external/rocksdb/table/block.h b/external/rocksdb/table/block.h index 2ce48d3fda..81ca2aa412 100644 --- a/external/rocksdb/table/block.h +++ b/external/rocksdb/table/block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,15 +10,18 @@ #pragma once #include #include +#include +#include #ifdef ROCKSDB_MALLOC_USABLE_SIZE #include #endif +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" -#include "db/dbformat.h" #include "table/block_prefix_index.h" -#include "table/block_hash_index.h" +#include "table/internal_iterator.h" #include "format.h" @@ -27,7 +30,6 @@ namespace rocksdb { struct BlockContents; class Comparator; class BlockIter; -class BlockHashIndex; class BlockPrefixIndex; class Block { @@ -66,9 +68,9 @@ class Block { // If total_order_seek is true, hash_index_ and prefix_index_ are ignored. 
// This option only applies for index block. For data block, hash_index_ // and prefix_index_ are null, so this option does not matter. - Iterator* NewIterator(const Comparator* comparator, - BlockIter* iter = nullptr, bool total_order_seek = true); - void SetBlockHashIndex(BlockHashIndex* hash_index); + InternalIterator* NewIterator(const Comparator* comparator, + BlockIter* iter = nullptr, + bool total_order_seek = true); void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index); // Report an approximation of how much memory has been used. @@ -79,7 +81,6 @@ class Block { const char* data_; // contents_.data.data() size_t size_; // contents_.data.size() uint32_t restart_offset_; // Offset in data_ of restart array - std::unique_ptr hash_index_; std::unique_ptr prefix_index_; // No copying allowed @@ -87,7 +88,7 @@ class Block { void operator=(const Block&); }; -class BlockIter : public Iterator { +class BlockIter : public InternalIterator { public: BlockIter() : comparator_(nullptr), @@ -97,20 +98,18 @@ class BlockIter : public Iterator { current_(0), restart_index_(0), status_(Status::OK()), - hash_index_(nullptr), - prefix_index_(nullptr) {} + prefix_index_(nullptr), + key_pinned_(false) {} BlockIter(const Comparator* comparator, const char* data, uint32_t restarts, - uint32_t num_restarts, BlockHashIndex* hash_index, - BlockPrefixIndex* prefix_index) + uint32_t num_restarts, BlockPrefixIndex* prefix_index) : BlockIter() { - Initialize(comparator, data, restarts, num_restarts, - hash_index, prefix_index); + Initialize(comparator, data, restarts, num_restarts, prefix_index); } void Initialize(const Comparator* comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, BlockHashIndex* hash_index, - BlockPrefixIndex* prefix_index) { + uint32_t restarts, uint32_t num_restarts, + BlockPrefixIndex* prefix_index) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid @@ -120,7 +119,6 @@ class 
BlockIter : public Iterator { num_restarts_ = num_restarts; current_ = restarts_; restart_index_ = num_restarts_; - hash_index_ = hash_index; prefix_index_ = prefix_index; } @@ -149,6 +147,23 @@ class BlockIter : public Iterator { virtual void SeekToLast() override; +#ifndef NDEBUG + ~BlockIter() { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + } + virtual void SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + virtual bool IsKeyPinned() const override { return key_pinned_; } + + virtual bool IsValuePinned() const override { return true; } + private: const Comparator* comparator_; const char* data_; // underlying block contents @@ -161,8 +176,32 @@ class BlockIter : public Iterator { IterKey key_; Slice value_; Status status_; - BlockHashIndex* hash_index_; BlockPrefixIndex* prefix_index_; + bool key_pinned_; + + struct CachedPrevEntry { + explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, + size_t _key_offset, size_t _key_size, Slice _value) + : offset(_offset), + key_ptr(_key_ptr), + key_offset(_key_offset), + key_size(_key_size), + value(_value) {} + + // offset of entry in block + uint32_t offset; + // Pointer to key data in block (nullptr if key is delta-encoded) + const char* key_ptr; + // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) + size_t key_offset; + // size of key + size_t key_size; + // value slice pointing to data in block + Slice value; + }; + std::string prev_entries_keys_buff_; + std::vector prev_entries_; + int32_t prev_entries_idx_ = -1; inline int Compare(const Slice& a, const Slice& b) const { return comparator_->Compare(a, b); @@ -202,8 +241,6 @@ class BlockIter : public Iterator { uint32_t left, uint32_t right, uint32_t* index); - bool 
HashSeek(const Slice& target, uint32_t* index); - bool PrefixSeek(const Slice& target, uint32_t* index); }; diff --git a/external/rocksdb/table/block_based_filter_block.cc b/external/rocksdb/table/block_based_filter_block.cc index c33d485975..427c9fe9c0 100644 --- a/external/rocksdb/table/block_based_filter_block.cc +++ b/external/rocksdb/table/block_based_filter_block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,19 +19,6 @@ namespace rocksdb { namespace { -bool SamePrefix(const SliceTransform* prefix_extractor, - const Slice& key1, const Slice& key2) { - if (!prefix_extractor->InDomain(key1) && - !prefix_extractor->InDomain(key2)) { - return true; - } else if (!prefix_extractor->InDomain(key1) || - !prefix_extractor->InDomain(key2)) { - return false; - } else { - return (prefix_extractor->Transform(key1) == - prefix_extractor->Transform(key2)); - } -} void AppendItem(std::string* props, const std::string& key, const std::string& value) { @@ -78,7 +65,9 @@ BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( const BlockBasedTableOptions& table_opt) : policy_(table_opt.filter_policy.get()), prefix_extractor_(prefix_extractor), - whole_key_filtering_(table_opt.whole_key_filtering) { + whole_key_filtering_(table_opt.whole_key_filtering), + prev_prefix_start_(0), + prev_prefix_size_(0) { assert(policy_); } @@ -91,14 +80,13 @@ void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { } void BlockBasedFilterBlockBuilder::Add(const Slice& key) { - added_to_start_ = 0; - if (whole_key_filtering_) { - AddKey(key); - added_to_start_ = 1; - } if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { AddPrefix(key); } + + 
if (whole_key_filtering_) { + AddKey(key); + } } // Add key to filter if needed @@ -111,19 +99,16 @@ inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { // get slice for most recently added entry Slice prev; - if (start_.size() > added_to_start_) { - size_t prev_start = start_[start_.size() - 1 - added_to_start_]; - const char* base = entries_.data() + prev_start; - size_t length = entries_.size() - prev_start; - prev = Slice(base, length); + if (prev_prefix_size_ > 0) { + prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_); } - // this assumes prefix(prefix(key)) == prefix(key), as the last - // entry in entries_ may be either a key or prefix, and we use - // prefix(last entry) to get the prefix of the last key. - if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) { - Slice prefix = prefix_extractor_->Transform(key); + Slice prefix = prefix_extractor_->Transform(key); + // insert prefix only when it's different from the previous prefix. 
+ if (prev.size() == 0 || prefix != prev) { start_.push_back(entries_.size()); + prev_prefix_start_ = entries_.size(); + prev_prefix_size_ = prefix.size(); entries_.append(prefix.data(), prefix.size()); } } @@ -169,15 +154,17 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { tmp_entries_.clear(); entries_.clear(); start_.clear(); + prev_prefix_start_ = 0; + prev_prefix_size_ = 0; } BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, bool whole_key_filtering, - BlockContents&& contents) - : policy_(table_opt.filter_policy.get()), + const BlockBasedTableOptions& table_opt, bool _whole_key_filtering, + BlockContents&& contents, Statistics* stats) + : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), + policy_(table_opt.filter_policy.get()), prefix_extractor_(prefix_extractor), - whole_key_filtering_(whole_key_filtering), data_(nullptr), offset_(nullptr), num_(0), diff --git a/external/rocksdb/table/block_based_filter_block.h b/external/rocksdb/table/block_based_filter_block.h index d339ac68a6..ca3f10e782 100644 --- a/external/rocksdb/table/block_based_filter_block.h +++ b/external/rocksdb/table/block_based_filter_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -55,9 +55,12 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { const SliceTransform* prefix_extractor_; bool whole_key_filtering_; + size_t prev_prefix_start_; // the position of the last appended prefix + // to "entries_". + size_t prev_prefix_size_; // the length of the last appended prefix to + // "entries_". 
std::string entries_; // Flattened entry contents std::vector start_; // Starting index in entries_ of each entry - uint32_t added_to_start_; // To indicate if key is added std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; @@ -75,7 +78,7 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, bool whole_key_filtering, - BlockContents&& contents); + BlockContents&& contents, Statistics* statistics); virtual bool IsBlockBased() override { return true; } virtual bool KeyMayMatch(const Slice& key, uint64_t block_offset = kNotValid) override; @@ -89,7 +92,6 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { private: const FilterPolicy* policy_; const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; const char* data_; // Pointer to filter data (at block-start) const char* offset_; // Pointer to beginning of offset array (at block-end) size_t num_; // Number of entries in offset array diff --git a/external/rocksdb/table/block_based_filter_block_test.cc b/external/rocksdb/table/block_based_filter_block_test.cc index 017de5906c..c28b0008d5 100644 --- a/external/rocksdb/table/block_based_filter_block_test.cc +++ b/external/rocksdb/table/block_based_filter_block_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -58,7 +58,7 @@ TEST_F(FilterBlockTest, EmptyBuilder) { BlockContents block(builder.Finish(), false, kNoCompression); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block)); + std::move(block), nullptr); ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); ASSERT_TRUE(reader.KeyMayMatch("foo", 100000)); } @@ -75,7 +75,7 @@ TEST_F(FilterBlockTest, SingleChunk) { builder.Add("hello"); BlockContents block(builder.Finish(), false, kNoCompression); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block)); + std::move(block), nullptr); ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); ASSERT_TRUE(reader.KeyMayMatch("bar", 100)); ASSERT_TRUE(reader.KeyMayMatch("box", 100)); @@ -107,7 +107,7 @@ TEST_F(FilterBlockTest, MultiChunk) { BlockContents block(builder.Finish(), false, kNoCompression); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block)); + std::move(block), nullptr); // Check first filter ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); @@ -153,7 +153,7 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { BlockContents block(builder->Finish(), false, kNoCompression); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block)); + nullptr, table_options_, true, std::move(block), nullptr); ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); ASSERT_TRUE(reader->KeyMayMatch("foo", 100000)); @@ -174,7 +174,7 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { builder->Add("hello"); BlockContents block(builder->Finish(), false, kNoCompression); FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block)); + nullptr, table_options_, true, std::move(block), nullptr); ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); ASSERT_TRUE(reader->KeyMayMatch("bar", 100)); 
ASSERT_TRUE(reader->KeyMayMatch("box", 100)); @@ -210,7 +210,7 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { BlockContents block(builder->Finish(), false, kNoCompression); FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block)); + nullptr, table_options_, true, std::move(block), nullptr); // Check first filter ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); diff --git a/external/rocksdb/table/block_based_table_builder.cc b/external/rocksdb/table/block_based_table_builder.cc index e13531529d..21720bdb74 100644 --- a/external/rocksdb/table/block_based_table_builder.cc +++ b/external/rocksdb/table/block_based_table_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -26,6 +26,7 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/table.h" #include "table/block.h" @@ -113,15 +114,17 @@ class IndexBuilder { // // Optimizations: // 1. Made block's `block_restart_interval` to be 1, which will avoid linear -// search when doing index lookup. +// search when doing index lookup (can be disabled by setting +// index_block_restart_interval). // 2. Shorten the key length for index block. Other than honestly using the // last key in the data block as the index key, we instead find a shortest // substitute key that serves the same function. 
class ShortenedIndexBuilder : public IndexBuilder { public: - explicit ShortenedIndexBuilder(const Comparator* comparator) + explicit ShortenedIndexBuilder(const Comparator* comparator, + int index_block_restart_interval) : IndexBuilder(comparator), - index_block_builder_(1 /* block_restart_interval == 1 */) {} + index_block_builder_(index_block_restart_interval) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, @@ -178,9 +181,10 @@ class ShortenedIndexBuilder : public IndexBuilder { class HashIndexBuilder : public IndexBuilder { public: explicit HashIndexBuilder(const Comparator* comparator, - const SliceTransform* hash_key_extractor) + const SliceTransform* hash_key_extractor, + int index_block_restart_interval) : IndexBuilder(comparator), - primary_index_builder_(comparator), + primary_index_builder_(comparator, index_block_restart_interval), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -237,10 +241,10 @@ class HashIndexBuilder : public IndexBuilder { void FlushPendingPrefix() { prefix_block_.append(pending_entry_prefix_.data(), pending_entry_prefix_.size()); - PutVarint32(&prefix_meta_block_, - static_cast(pending_entry_prefix_.size())); - PutVarint32(&prefix_meta_block_, pending_entry_index_); - PutVarint32(&prefix_meta_block_, pending_block_num_); + PutVarint32Varint32Varint32( + &prefix_meta_block_, + static_cast(pending_entry_prefix_.size()), + pending_entry_index_, pending_block_num_); } ShortenedIndexBuilder primary_index_builder_; @@ -266,13 +270,16 @@ namespace { // Create a index builder based on its type. 
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, - const SliceTransform* prefix_extractor) { + const SliceTransform* prefix_extractor, + int index_block_restart_interval) { switch (type) { case BlockBasedTableOptions::kBinarySearch: { - return new ShortenedIndexBuilder(comparator); + return new ShortenedIndexBuilder(comparator, + index_block_restart_interval); } case BlockBasedTableOptions::kHashSearch: { - return new HashIndexBuilder(comparator, prefix_extractor); + return new HashIndexBuilder(comparator, prefix_extractor, + index_block_restart_interval); } default: { assert(!"Do not recognize the index type "); @@ -309,6 +316,7 @@ bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { Slice CompressBlock(const Slice& raw, const CompressionOptions& compression_options, CompressionType* type, uint32_t format_version, + const Slice& compression_dict, std::string* compressed_output) { if (*type == kNoCompression) { return raw; @@ -328,7 +336,7 @@ Slice CompressBlock(const Slice& raw, if (Zlib_Compress( compression_options, GetCompressFormatForVersion(kZlibCompression, format_version), - raw.data(), raw.size(), compressed_output) && + raw.data(), raw.size(), compressed_output, compression_dict) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } @@ -346,7 +354,7 @@ Slice CompressBlock(const Slice& raw, if (LZ4_Compress( compression_options, GetCompressFormatForVersion(kLZ4Compression, format_version), - raw.data(), raw.size(), compressed_output) && + raw.data(), raw.size(), compressed_output, compression_dict) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } @@ -355,14 +363,21 @@ Slice CompressBlock(const Slice& raw, if (LZ4HC_Compress( compression_options, GetCompressFormatForVersion(kLZ4HCCompression, format_version), - raw.data(), raw.size(), compressed_output) && + raw.data(), raw.size(), compressed_output, compression_dict) && 
GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. + case kXpressCompression: + if (XPRESS_Compress(raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; case kZSTDNotFinalCompression: if (ZSTD_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + compressed_output, compression_dict) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } @@ -455,6 +470,8 @@ struct BlockBasedTableBuilder::Rep { std::string last_key; const CompressionType compression_type; const CompressionOptions compression_opts; + // Data for presetting the compression library's dictionary, or nullptr. + const std::string* compression_dict; TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. @@ -466,6 +483,8 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; + uint32_t column_family_id; + const std::string& column_family_name; std::vector> table_properties_collectors; @@ -474,27 +493,35 @@ struct BlockBasedTableBuilder::Rep { const InternalKeyComparator& icomparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFileWriter* f, const CompressionType _compression_type, - const CompressionOptions& _compression_opts, const bool skip_filters) + uint32_t _column_family_id, WritableFileWriter* f, + const CompressionType _compression_type, + const CompressionOptions& _compression_opts, + const std::string* _compression_dict, const bool skip_filters, + const std::string& _column_family_name) : ioptions(_ioptions), table_options(table_opt), internal_comparator(icomparator), file(f), - data_block(table_options.block_restart_interval), + data_block(table_options.block_restart_interval, + table_options.use_delta_encoding), 
internal_prefix_transform(_ioptions.prefix_extractor), - index_builder(CreateIndexBuilder(table_options.index_type, - &internal_comparator, - &this->internal_prefix_transform)), + index_builder( + CreateIndexBuilder(table_options.index_type, &internal_comparator, + &this->internal_prefix_transform, + table_options.index_block_restart_interval)), compression_type(_compression_type), compression_opts(_compression_opts), + compression_dict(_compression_dict), filter_block(skip_filters ? nullptr : CreateFilterBlockBuilder( _ioptions, table_options)), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( - table_options, data_block)) { + table_options, data_block)), + column_family_id(_column_family_id), + column_family_name(_column_family_name) { for (auto& collector_factories : *int_tbl_prop_collector_factories) { table_properties_collectors.emplace_back( - collector_factories->CreateIntTblPropCollector()); + collector_factories->CreateIntTblPropCollector(column_family_id)); } table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( @@ -509,8 +536,11 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFileWriter* file, const CompressionType compression_type, - const CompressionOptions& compression_opts, const bool skip_filters) { + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts, + const std::string* compression_dict, const bool skip_filters, + const std::string& column_family_name) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { @@ -523,8 +553,9 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( } rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator, - 
int_tbl_prop_collector_factories, file, compression_type, - compression_opts, skip_filters); + int_tbl_prop_collector_factories, column_family_id, file, + compression_type, compression_opts, compression_dict, + skip_filters, column_family_name); if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); @@ -589,8 +620,8 @@ void BlockBasedTableBuilder::Flush() { assert(!r->closed); if (!ok()) return; if (r->data_block.empty()) return; - WriteBlock(&r->data_block, &r->pending_handle); - if (ok()) { + WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); + if (ok() && !r->table_options.skip_table_builder_flush) { r->status = r->file->Flush(); } if (r->filter_block != nullptr) { @@ -601,13 +632,15 @@ void BlockBasedTableBuilder::Flush() { } void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, - BlockHandle* handle) { - WriteBlock(block->Finish(), handle); + BlockHandle* handle, + bool is_data_block) { + WriteBlock(block->Finish(), handle, is_data_block); block->Reset(); } void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, - BlockHandle* handle) { + BlockHandle* handle, + bool is_data_block) { // File format contains a sequence of blocks where each block has: // block_data: uint8[n] // type: uint8 @@ -617,15 +650,70 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, auto type = r->compression_type; Slice block_contents; + bool abort_compression = false; + + StopWatchNano timer(r->ioptions.env, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + if (raw_block_contents.size() < kCompressionSizeLimit) { - block_contents = - CompressBlock(raw_block_contents, r->compression_opts, &type, - r->table_options.format_version, &r->compressed_output); + Slice compression_dict; + if (is_data_block && r->compression_dict && r->compression_dict->size()) { + compression_dict = *r->compression_dict; + } + + block_contents = CompressBlock(raw_block_contents, r->compression_opts, + 
&type, r->table_options.format_version, + compression_dict, &r->compressed_output); + + // Some of the compression algorithms are known to be unreliable. If + // the verify_compression flag is set then try to de-compress the + // compressed data and compare to the input. + if (type != kNoCompression && r->table_options.verify_compression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + Status stat = UncompressBlockContentsForCompressionType( + block_contents.data(), block_contents.size(), &contents, + r->table_options.format_version, compression_dict, type, + r->ioptions); + + if (stat.ok()) { + bool compressed_ok = contents.data.compare(raw_block_contents) == 0; + if (!compressed_ok) { + // The result of the compression was invalid. abort. + abort_compression = true; + Log(InfoLogLevel::ERROR_LEVEL, r->ioptions.info_log, + "Decompressed block did not match raw block"); + r->status = + Status::Corruption("Decompressed block did not match raw block"); + } + } else { + // Decompression reported an error. abort. + r->status = Status::Corruption("Could not decompress"); + abort_compression = true; + } + } } else { + // Block is too big to be compressed. + abort_compression = true; + } + + // Abort compression if the block is too big, or did not pass + // verification. 
+ if (abort_compression) { RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; block_contents = raw_block_contents; } + else if (type != kNoCompression && + ShouldReportDetailedTime(r->ioptions.env, + r->ioptions.statistics)) { + MeasureTime(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + MeasureTime(r->ioptions.statistics, BYTES_COMPRESSED, + raw_block_contents.size()); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); + } + WriteRawBlock(block_contents, type, handle); r->compressed_output.clear(); } @@ -693,7 +781,6 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, if (type != kNoCompression && block_cache_compressed != nullptr) { - Cache::Handle* cache_handle = nullptr; size_t size = block_contents.size(); std::unique_ptr ubuf(new char[size + 1]); @@ -713,9 +800,8 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, (end - r->compressed_cache_key_prefix)); // Insert into compressed block cache. - cache_handle = block_cache_compressed->Insert( - key, block, block->usable_size(), &DeleteCachedBlock); - block_cache_compressed->Release(cache_handle); + block_cache_compressed->Insert(key, block, block->usable_size(), + &DeleteCachedBlock); // Invalidate OS cache. 
r->file->InvalidateCache(static_cast(r->offset), size); @@ -730,7 +816,8 @@ Status BlockBasedTableBuilder::Finish() { assert(!r->closed); r->closed = true; - BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle; + BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle, + compression_dict_block_handle; // Write filter block if (ok() && r->filter_block != nullptr) { auto filter_contents = r->filter_block->Finish(); @@ -761,7 +848,7 @@ Status BlockBasedTableBuilder::Finish() { MetaIndexBuilder meta_index_builder; for (const auto& item : index_blocks.meta_blocks) { BlockHandle block_handle; - WriteBlock(item.second, &block_handle); + WriteBlock(item.second, &block_handle, false /* is_data_block */); meta_index_builder.Add(item.first, block_handle); } @@ -779,13 +866,35 @@ Status BlockBasedTableBuilder::Finish() { meta_index_builder.Add(key, filter_block_handle); } - // Write properties block. + // Write properties and compression dictionary blocks. { PropertyBlockBuilder property_block_builder; + r->props.column_family_id = r->column_family_id; + r->props.column_family_name = r->column_family_name; r->props.filter_policy_name = r->table_options.filter_policy != nullptr ? r->table_options.filter_policy->Name() : ""; r->props.index_size = r->index_builder->EstimatedSize() + kBlockTrailerSize; + r->props.comparator_name = r->ioptions.comparator != nullptr + ? r->ioptions.comparator->Name() + : "nullptr"; + r->props.merge_operator_name = r->ioptions.merge_operator != nullptr + ? 
r->ioptions.merge_operator->Name() + : "nullptr"; + r->props.compression_name = CompressionTypeToString(r->compression_type); + + std::string property_collectors_names = "["; + property_collectors_names = "["; + for (size_t i = 0; + i < r->ioptions.table_properties_collector_factories.size(); ++i) { + if (i != 0) { + property_collectors_names += ","; + } + property_collectors_names += + r->ioptions.table_properties_collector_factories[i]->Name(); + } + property_collectors_names += "]"; + r->props.property_collectors_names = property_collectors_names; // Add basic properties property_block_builder.AddTableProperty(r->props); @@ -801,9 +910,16 @@ Status BlockBasedTableBuilder::Finish() { kNoCompression, &properties_block_handle ); - meta_index_builder.Add(kPropertiesBlock, properties_block_handle); - } // end of properties block writing + + // Write compression dictionary block + if (r->compression_dict && r->compression_dict->size()) { + WriteRawBlock(*r->compression_dict, kNoCompression, + &compression_dict_block_handle); + meta_index_builder.Add(kCompressionDictBlock, + compression_dict_block_handle); + } + } // end of properties/compression dictionary block writing } // meta blocks // Write index block @@ -811,7 +927,8 @@ Status BlockBasedTableBuilder::Finish() { // flush the meta index block WriteRawBlock(meta_index_builder.Finish(), kNoCompression, &metaindex_block_handle); - WriteBlock(index_blocks.index_block_contents, &index_block_handle); + WriteBlock(index_blocks.index_block_contents, &index_block_handle, + false /* is_data_block */); } // Write footer @@ -871,8 +988,9 @@ TableProperties BlockBasedTableBuilder::GetTableProperties() const { TableProperties ret = rep_->props; for (const auto& collector : rep_->table_properties_collectors) { for (const auto& prop : collector->GetReadableProperties()) { - ret.user_collected_properties.insert(prop); + ret.readable_properties.insert(prop); } + collector->Finish(&ret.user_collected_properties); } return ret; } 
diff --git a/external/rocksdb/table/block_based_table_builder.h b/external/rocksdb/table/block_based_table_builder.h index ce868207a0..8172c238ec 100644 --- a/external/rocksdb/table/block_based_table_builder.h +++ b/external/rocksdb/table/block_based_table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -34,14 +34,19 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). + // @param compression_dict Data for presetting the compression library's + // dictionary, or nullptr. BlockBasedTableBuilder( const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFileWriter* file, const CompressionType compression_type, - const CompressionOptions& compression_opts, const bool skip_filters); + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts, + const std::string* compression_dict, const bool skip_filters, + const std::string& column_family_name); // REQUIRES: Either Finish() or Abandon() has been called. ~BlockBasedTableBuilder(); @@ -80,11 +85,14 @@ class BlockBasedTableBuilder : public TableBuilder { private: bool ok() const { return status().ok(); } + // Call block's Finish() method and then write the finalize block contents to // file. 
- void WriteBlock(BlockBuilder* block, BlockHandle* handle); + void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); + // Directly write block content to the file. - void WriteBlock(const Slice& block_contents, BlockHandle* handle); + void WriteBlock(const Slice& block_contents, BlockHandle* handle, + bool is_data_block); void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle); Status InsertBlockInCache(const Slice& block_contents, const CompressionType type, diff --git a/external/rocksdb/table/block_based_table_factory.cc b/external/rocksdb/table/block_based_table_factory.cc index ea910c6b2e..e9499cd676 100644 --- a/external/rocksdb/table/block_based_table_factory.cc +++ b/external/rocksdb/table/block_based_table_factory.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -24,8 +24,8 @@ namespace rocksdb { BlockBasedTableFactory::BlockBasedTableFactory( - const BlockBasedTableOptions& table_options) - : table_options_(table_options) { + const BlockBasedTableOptions& _table_options) + : table_options_(_table_options) { if (table_options_.flush_block_policy_factory == nullptr) { table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); @@ -39,37 +39,38 @@ BlockBasedTableFactory::BlockBasedTableFactory( table_options_.block_size_deviation > 100) { table_options_.block_size_deviation = 0; } + if (table_options_.block_restart_interval < 1) { + table_options_.block_restart_interval = 1; + } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } } Status BlockBasedTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const { - return NewTableReader(table_reader_options, std::move(file), file_size, - table_reader, - /*prefetch_index_and_filter=*/true); -} - -Status BlockBasedTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, const bool prefetch_enabled) const { + unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache) const { return BlockBasedTable::Open( table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), - file_size, table_reader, prefetch_enabled); + file_size, table_reader, prefetch_index_and_filter_in_cache, + table_reader_options.skip_filters, table_reader_options.level); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, WritableFileWriter* file) const { auto table_builder = new BlockBasedTableBuilder( 
table_builder_options.ioptions, table_options_, table_builder_options.internal_comparator, - table_builder_options.int_tbl_prop_collector_factories, file, - table_builder_options.compression_type, + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_builder_options.compression_type, table_builder_options.compression_opts, - table_builder_options.skip_filters); + table_builder_options.compression_dict, + table_builder_options.skip_filters, + table_builder_options.column_family_name); return table_builder; } @@ -87,6 +88,12 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " ", but block cache is disabled"); } + if (table_options_.pin_l0_filter_and_index_blocks_in_cache && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable pin_l0_filter_and_index_blocks_in_cache, " + ", but block cache is disabled"); + } if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { return Status::InvalidArgument( "Unsupported BlockBasedTable format_version. 
Please check " @@ -103,11 +110,15 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", table_options_.flush_block_policy_factory->Name(), - table_options_.flush_block_policy_factory.get()); + static_cast(table_options_.flush_block_policy_factory.get())); ret.append(buffer); snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", table_options_.cache_index_and_filter_blocks); ret.append(buffer); + snprintf(buffer, kBufferSize, + " pin_l0_filter_and_index_blocks_in_cache: %d\n", + table_options_.pin_l0_filter_and_index_blocks_in_cache); + ret.append(buffer); snprintf(buffer, kBufferSize, " index_type: %d\n", table_options_.index_type); ret.append(buffer); @@ -121,7 +132,7 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { table_options_.no_block_cache); ret.append(buffer); snprintf(buffer, kBufferSize, " block_cache: %p\n", - table_options_.block_cache.get()); + static_cast(table_options_.block_cache.get())); ret.append(buffer); if (table_options_.block_cache) { snprintf(buffer, kBufferSize, " block_cache_size: %" ROCKSDB_PRIszt "\n", @@ -129,7 +140,7 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { ret.append(buffer); } snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", - table_options_.block_cache_compressed.get()); + static_cast(table_options_.block_cache_compressed.get())); ret.append(buffer); if (table_options_.block_cache_compressed) { snprintf(buffer, kBufferSize, @@ -146,25 +157,32 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", table_options_.block_restart_interval); ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); snprintf(buffer, kBufferSize, " filter_policy: %s\n", table_options_.filter_policy 
== nullptr ? "nullptr" : table_options_.filter_policy->Name()); ret.append(buffer); snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", table_options_.whole_key_filtering); + ret.append(buffer); + snprintf(buffer, kBufferSize, " skip_table_builder_flush: %d\n", + table_options_.skip_table_builder_flush); + ret.append(buffer); snprintf(buffer, kBufferSize, " format_version: %d\n", table_options_.format_version); ret.append(buffer); return ret; } -const BlockBasedTableOptions& BlockBasedTableFactory::GetTableOptions() const { +const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { return table_options_; } TableFactory* NewBlockBasedTableFactory( - const BlockBasedTableOptions& table_options) { - return new BlockBasedTableFactory(table_options); + const BlockBasedTableOptions& _table_options) { + return new BlockBasedTableFactory(_table_options); } const std::string BlockBasedTablePropertyNames::kIndexType = diff --git a/external/rocksdb/table/block_based_table_factory.h b/external/rocksdb/table/block_based_table_factory.h index 8bdd4cd742..6ecb232e49 100644 --- a/external/rocksdb/table/block_based_table_factory.h +++ b/external/rocksdb/table/block_based_table_factory.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -33,22 +33,15 @@ class BlockBasedTableFactory : public TableFactory { const char* Name() const override { return "BlockBasedTable"; } - Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader) const override; - - // This is a variant of virtual member function NewTableReader function with - // added capability to disable pre-fetching of blocks on BlockBasedTable::Open - Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader, - bool prefetch_index_and_filter) const; + Status NewTableReader( + const TableReaderOptions& table_reader_options, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFileWriter* file) const override; + uint32_t column_family_id, WritableFileWriter* file) const override; // Sanitizes the specified DB Options. Status SanitizeOptions(const DBOptions& db_opts, @@ -56,7 +49,9 @@ class BlockBasedTableFactory : public TableFactory { std::string GetPrintableTableOptions() const override; - const BlockBasedTableOptions& GetTableOptions() const; + const BlockBasedTableOptions& table_options() const; + + void* GetOptions() override { return &table_options_; } private: BlockBasedTableOptions table_options_; diff --git a/external/rocksdb/table/block_based_table_reader.cc b/external/rocksdb/table/block_based_table_reader.cc index b11327248c..7b98a48889 100644 --- a/external/rocksdb/table/block_based_table_reader.cc +++ b/external/rocksdb/table/block_based_table_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,13 +6,13 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. - #include "table/block_based_table_reader.h" #include #include #include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -25,22 +25,24 @@ #include "rocksdb/table_properties.h" #include "table/block.h" -#include "table/filter_block.h" #include "table/block_based_filter_block.h" #include "table/block_based_table_factory.h" -#include "table/full_filter_block.h" -#include "table/block_hash_index.h" #include "table/block_prefix_index.h" +#include "table/filter_block.h" #include "table/format.h" +#include "table/full_filter_block.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" #include "table/meta_blocks.h" +#include "table/persistent_cache_helper.h" #include "table/two_level_iterator.h" -#include "table/get_context.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "util/sync_point.h" namespace rocksdb { @@ -52,24 +54,21 @@ using std::unique_ptr; typedef BlockBasedTable::IndexReader IndexReader; namespace { -// The longest the prefix of the cache key used to identify blocks can be. -// We are using the fact that we know for Posix files the unique ID is three -// varints. -// For some reason, compiling for iOS complains that this variable is unused -const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) = - kMaxVarint64Length * 3 + 1; - // Read the block identified by "handle" from "file". 
// The only relevant option is options.verify_checksums for now. // On failure return non-OK. // On success fill *result and return OK - caller owns *result +// @param compression_dict Data for presetting the compression library's +// dictionary. Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, Env* env, - bool do_uncompress = true) { + std::unique_ptr* result, + const ImmutableCFOptions &ioptions, + bool do_uncompress, const Slice& compression_dict, + const PersistentCacheOptions& cache_options) { BlockContents contents; - Status s = ReadBlockContents(file, footer, options, handle, &contents, env, - do_uncompress); + Status s = ReadBlockContents(file, footer, options, handle, &contents, ioptions, + do_uncompress, compression_dict, cache_options); if (s.ok()) { result->reset(new Block(std::move(contents))); } @@ -90,6 +89,9 @@ void DeleteCachedEntry(const Slice& key, void* value) { delete entry; } +void DeleteCachedFilterEntry(const Slice& key, void* value); +void DeleteCachedIndexEntry(const Slice& key, void* value); + // Release the cached entry and decrement its ref count. 
void ReleaseCachedEntry(void* arg, void* h) { Cache* cache = reinterpret_cast(arg); @@ -97,14 +99,14 @@ void ReleaseCachedEntry(void* arg, void* h) { cache->Release(handle); } -Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key) { +Slice GetCacheKeyFromOffset(const char* cache_key_prefix, + size_t cache_key_prefix_size, uint64_t offset, + char* cache_key) { assert(cache_key != nullptr); assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize); memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = - EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset); return Slice(cache_key, static_cast(end - cache_key)); } @@ -138,28 +140,32 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, // IndexReader is the interface that provide the functionality for index access. class BlockBasedTable::IndexReader { public: - explicit IndexReader(const Comparator* comparator) - : comparator_(comparator) {} + explicit IndexReader(const Comparator* comparator, Statistics* stats) + : comparator_(comparator), statistics_(stats) {} virtual ~IndexReader() {} // Create an iterator for index access. // An iter is passed in, if it is not null, update this one and return it // If it is null, create a new Iterator - virtual Iterator* NewIterator( - BlockIter* iter = nullptr, bool total_order_seek = true) = 0; + virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, + bool total_order_seek = true) = 0; // The size of the index. 
virtual size_t size() const = 0; // Memory usage of the index block virtual size_t usable_size() const = 0; - + // return the statistics pointer + virtual Statistics* statistics() const { return statistics_; } // Report an approximation of how much memory has been used other than memory // that was allocated in block cache. virtual size_t ApproximateMemoryUsage() const = 0; protected: const Comparator* comparator_; + + private: + Statistics* statistics_; }; // Index that allows binary search lookup for the first key of each block. @@ -172,23 +178,25 @@ class BinarySearchIndexReader : public IndexReader { // On success, index_reader will be populated; otherwise it will remain // unmodified. static Status Create(RandomAccessFileReader* file, const Footer& footer, - const BlockHandle& index_handle, Env* env, - const Comparator* comparator, - IndexReader** index_reader) { + const BlockHandle& index_handle, + const ImmutableCFOptions &ioptions, + const Comparator* comparator, IndexReader** index_reader, + const PersistentCacheOptions& cache_options) { std::unique_ptr index_block; auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle, - &index_block, env); + &index_block, ioptions, true /* decompress */, + Slice() /*compression dict*/, cache_options); if (s.ok()) { - *index_reader = - new BinarySearchIndexReader(comparator, std::move(index_block)); + *index_reader = new BinarySearchIndexReader( + comparator, std::move(index_block), ioptions.statistics); } return s; } - virtual Iterator* NewIterator( - BlockIter* iter = nullptr, bool dont_care = true) override { + virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, + bool dont_care = true) override { return index_block_->NewIterator(comparator_, iter, true); } @@ -204,8 +212,9 @@ class BinarySearchIndexReader : public IndexReader { private: BinarySearchIndexReader(const Comparator* comparator, - std::unique_ptr&& index_block) - : IndexReader(comparator), index_block_(std::move(index_block)) { + 
std::unique_ptr&& index_block, + Statistics* stats) + : IndexReader(comparator, stats), index_block_(std::move(index_block)) { assert(index_block_ != nullptr); } std::unique_ptr index_block_; @@ -215,15 +224,17 @@ class BinarySearchIndexReader : public IndexReader { // key. class HashIndexReader : public IndexReader { public: - static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFileReader* file, - Env* env, const Comparator* comparator, - const BlockHandle& index_handle, - Iterator* meta_index_iter, IndexReader** index_reader, - bool hash_index_allow_collision) { + static Status Create( + const SliceTransform* hash_key_extractor, const Footer& footer, + RandomAccessFileReader* file, const ImmutableCFOptions &ioptions, + const Comparator* comparator, const BlockHandle& index_handle, + InternalIterator* meta_index_iter, IndexReader** index_reader, + bool hash_index_allow_collision, + const PersistentCacheOptions& cache_options) { std::unique_ptr index_block; auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle, - &index_block, env); + &index_block, ioptions, true /* decompress */, + Slice() /*compression dict*/, cache_options); if (!s.ok()) { return s; @@ -234,7 +245,8 @@ class HashIndexReader : public IndexReader { // So, Create will succeed regardless, from this point on. 
auto new_index_reader = - new HashIndexReader(comparator, std::move(index_block)); + new HashIndexReader(comparator, std::move(index_block), + ioptions.statistics); *index_reader = new_index_reader; // Get prefixes block @@ -258,48 +270,33 @@ class HashIndexReader : public IndexReader { // Read contents for the blocks BlockContents prefixes_contents; s = ReadBlockContents(file, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, env, true /* do decompression */); + &prefixes_contents, ioptions, true /* decompress */, + Slice() /*compression dict*/, cache_options); if (!s.ok()) { return s; } BlockContents prefixes_meta_contents; s = ReadBlockContents(file, footer, ReadOptions(), prefixes_meta_handle, - &prefixes_meta_contents, env, - true /* do decompression */); + &prefixes_meta_contents, ioptions, true /* decompress */, + Slice() /*compression dict*/, cache_options); if (!s.ok()) { // TODO: log error return Status::OK(); } - if (!hash_index_allow_collision) { - // TODO: deprecate once hash_index_allow_collision proves to be stable. 
- BlockHashIndex* hash_index = nullptr; - s = CreateBlockHashIndex(hash_key_extractor, - prefixes_contents.data, - prefixes_meta_contents.data, - &hash_index); - // TODO: log error - if (s.ok()) { - new_index_reader->index_block_->SetBlockHashIndex(hash_index); - new_index_reader->OwnPrefixesContents(std::move(prefixes_contents)); - } - } else { - BlockPrefixIndex* prefix_index = nullptr; - s = BlockPrefixIndex::Create(hash_key_extractor, - prefixes_contents.data, - prefixes_meta_contents.data, - &prefix_index); - // TODO: log error - if (s.ok()) { - new_index_reader->index_block_->SetBlockPrefixIndex(prefix_index); - } + BlockPrefixIndex* prefix_index = nullptr; + s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + new_index_reader->index_block_->SetBlockPrefixIndex(prefix_index); } return Status::OK(); } - virtual Iterator* NewIterator( - BlockIter* iter = nullptr, bool total_order_seek = true) override { + virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, + bool total_order_seek = true) override { return index_block_->NewIterator(comparator_, iter, total_order_seek); } @@ -316,31 +313,49 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const Comparator* comparator, - std::unique_ptr&& index_block) - : IndexReader(comparator), index_block_(std::move(index_block)) { + std::unique_ptr&& index_block, Statistics* stats) + : IndexReader(comparator, stats), index_block_(std::move(index_block)) { assert(index_block_ != nullptr); } ~HashIndexReader() { } - void OwnPrefixesContents(BlockContents&& prefixes_contents) { - prefixes_contents_ = std::move(prefixes_contents); - } - std::unique_ptr index_block_; BlockContents prefixes_contents_; }; +// CachableEntry represents the entries that *may* be fetched from block cache. +// field `value` is the item we want to get. 
+// field `cache_handle` is the cache handle to the block cache. If the value +// was not read from cache, `cache_handle` will be nullptr. +template +struct BlockBasedTable::CachableEntry { + CachableEntry(TValue* _value, Cache::Handle* _cache_handle) + : value(_value), cache_handle(_cache_handle) {} + CachableEntry() : CachableEntry(nullptr, nullptr) {} + void Release(Cache* cache) { + if (cache_handle) { + cache->Release(cache_handle); + value = nullptr; + cache_handle = nullptr; + } + } + bool IsSet() const { return cache_handle != nullptr; } + + TValue* value = nullptr; + // if the entry is from the cache, cache_handle will be populated. + Cache::Handle* cache_handle = nullptr; +}; struct BlockBasedTable::Rep { Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, - const InternalKeyComparator& _internal_comparator) + const InternalKeyComparator& _internal_comparator, bool skip_filters) : ioptions(_ioptions), env_options(_env_options), table_options(_table_opt), - filter_policy(_table_opt.filter_policy.get()), + filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()), internal_comparator(_internal_comparator), filter_type(FilterType::kNoFilter), whole_key_filtering(_table_opt.whole_key_filtering), @@ -355,8 +370,13 @@ struct BlockBasedTable::Rep { unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t cache_key_prefix_size = 0; + char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t persistent_cache_key_prefix_size = 0; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size = 0; + uint64_t dummy_index_reader_offset = + 0; // ID that is unique for the block cache. 
+ PersistentCacheOptions persistent_cache_options; // Footer contains the fixed table information Footer footer; @@ -375,6 +395,11 @@ struct BlockBasedTable::Rep { BlockHandle filter_handle; std::shared_ptr table_properties; + // Block containing the data for the compression dictionary. We take ownership + // for the entire block struct, even though we only use its Slice member. This + // is easier because the Slice member depends on the continued existence of + // another member ("allocation"). + std::unique_ptr compression_dict_block; BlockBasedTableOptions::IndexType index_type; bool hash_index_allow_collision; bool whole_key_filtering; @@ -384,42 +409,37 @@ struct BlockBasedTable::Rep { // and compatible with existing code, we introduce a wrapper that allows // block to extract prefix without knowing if a key is internal or not. unique_ptr internal_prefix_transform; + + // only used in level 0 files: + // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the + // LRU cache, but we always keep the filter & idndex block's handle checked + // out here (=we don't call Release()), plus the parsed out objects + // the LRU cache will never push flush them out, hence they're pinned + CachableEntry filter_entry; + CachableEntry index_entry; }; BlockBasedTable::~BlockBasedTable() { + Close(); delete rep_; } -// CachableEntry represents the entries that *may* be fetched from block cache. -// field `value` is the item we want to get. -// field `cache_handle` is the cache handle to the block cache. If the value -// was not read from cache, `cache_handle` will be nullptr. 
-template -struct BlockBasedTable::CachableEntry { - CachableEntry(TValue* _value, Cache::Handle* _cache_handle) - : value(_value), cache_handle(_cache_handle) {} - CachableEntry() : CachableEntry(nullptr, nullptr) {} - void Release(Cache* cache) { - if (cache_handle) { - cache->Release(cache_handle); - value = nullptr; - cache_handle = nullptr; - } - } - - TValue* value = nullptr; - // if the entry is from the cache, cache_handle will be populated. - Cache::Handle* cache_handle = nullptr; -}; - // Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + // Create dummy offset of index reader which is beyond the file size. + rep->dummy_index_reader_offset = + file_size + rep->table_options.block_cache->NewId(); + } + if (rep->table_options.persistent_cache != nullptr) { + GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), + &rep->persistent_cache_key_prefix[0], + &rep->persistent_cache_key_prefix_size); } if (rep->table_options.block_cache_compressed != nullptr) { GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), @@ -436,7 +456,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, // If the prefix wasn't generated or was too long, // create one from the cache. 
- if (*size == 0) { + if (cc && *size == 0) { char* end = EncodeVarint64(buffer, cc->NewId()); *size = static_cast(end - buffer); } @@ -477,6 +497,18 @@ bool IsFeatureSupported(const TableProperties& table_properties, } } // namespace +Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast(end - cache_key)); +} + Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, @@ -484,7 +516,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, - const bool prefetch_index_and_filter) { + const bool prefetch_index_and_filter_in_cache, + const bool skip_filters, const int level) { table_reader->reset(); Footer footer; @@ -501,18 +534,32 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // We've successfully read the footer and the index block: we're // ready to serve requests. - Rep* rep = new BlockBasedTable::Rep( - ioptions, env_options, table_options, internal_comparator); + // Better not mutate rep_ after the creation. eg. internal_prefix_transform + // raw pointer will be used to create HashIndexReader, whose reset may + // access a dangling pointer. 
+ Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters); rep->file = std::move(file); rep->footer = footer; rep->index_type = table_options.index_type; rep->hash_index_allow_collision = table_options.hash_index_allow_collision; - SetupCacheKeyPrefix(rep); + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefix correctly. + rep->internal_prefix_transform.reset( + new InternalKeySliceTransform(rep->ioptions.prefix_extractor)); + SetupCacheKeyPrefix(rep, file_size); unique_ptr new_table(new BlockBasedTable(rep)); + // page cache options + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + std::string(rep->persistent_cache_key_prefix, + rep->persistent_cache_key_prefix_size), + rep->ioptions.statistics); + // Read meta index std::unique_ptr meta; - std::unique_ptr meta_iter; + std::unique_ptr meta_iter; s = ReadMetaBlock(rep, &meta, &meta_iter); if (!s.ok()) { return s; @@ -546,8 +593,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, - rep->ioptions.env, rep->ioptions.info_log, - &table_properties); + rep->ioptions, &table_properties); } if (!s.ok()) { @@ -562,6 +608,31 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, "Cannot find Properties block from file."); } + // Read the compression dictionary meta block + bool found_compression_dict; + s = SeekToCompressionDictBlock(meta_iter.get(), &found_compression_dict); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log, + "Cannot seek to compression dictionary block from file: %s", + s.ToString().c_str()); + } else if (found_compression_dict) { + // TODO(andrewkr): Add to block cache if cache_index_and_filter_blocks is + // true. 
+ unique_ptr compression_dict_block{new BlockContents()}; + s = rocksdb::ReadMetaBlock(rep->file.get(), file_size, + kBlockBasedTableMagicNumber, rep->ioptions, + rocksdb::kCompressionDictBlock, + compression_dict_block.get()); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log, + "Encountered error while reading data from compression dictionary " + "block %s", + s.ToString().c_str()); + } else { + rep->compression_dict_block = std::move(compression_dict_block); + } + } + // Determine whether whole key filtering is supported. if (rep->table_properties) { rep->whole_key_filtering &= @@ -573,38 +644,59 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); } - if (prefetch_index_and_filter) { // pre-fetching of blocks is turned on - // Will use block cache for index/filter blocks access? - if (table_options.cache_index_and_filter_blocks) { + // Will use block cache for index/filter blocks access + // Always prefetch index and filter for level 0 + if (table_options.cache_index_and_filter_blocks) { + if (prefetch_index_and_filter_in_cache || level == 0) { assert(table_options.block_cache != nullptr); // Hack: Call NewIndexIterator() to implicitly add index to the // block_cache - unique_ptr iter(new_table->NewIndexIterator(ReadOptions())); + + // if pin_l0_filter_and_index_blocks_in_cache is true and this is + // a level0 file, then we will pass in this pointer to rep->index + // to NewIndexIterator(), which will save the index block in there + // else it's a nullptr and nothing special happens + CachableEntry* index_entry = nullptr; + if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache && + level == 0) { + index_entry = &rep->index_entry; + } + unique_ptr iter( + new_table->NewIndexIterator(ReadOptions(), nullptr, index_entry)); s = iter->status(); if (s.ok()) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = 
new_table->GetFilter(); - filter_entry.Release(table_options.block_cache.get()); + // if pin_l0_filter_and_index_blocks_in_cache is true, and this is + // a level0 file, then save it in rep_->filter_entry; it will be + // released in the destructor only, hence it will be pinned in the + // cache while this reader is alive + if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache && + level == 0) { + rep->filter_entry = filter_entry; + } else { + filter_entry.Release(table_options.block_cache.get()); + } } - } else { - // If we don't use block cache for index/filter blocks access, we'll - // pre-load these blocks, which will kept in member variables in Rep - // and with a same life-time as this table object. - IndexReader* index_reader = nullptr; - s = new_table->CreateIndexReader(&index_reader, meta_iter.get()); + } + } else { + // If we don't use block cache for index/filter blocks access, we'll + // pre-load these blocks, which will kept in member variables in Rep + // and with a same life-time as this table object. + IndexReader* index_reader = nullptr; + s = new_table->CreateIndexReader(&index_reader, meta_iter.get()); - if (s.ok()) { - rep->index_reader.reset(index_reader); + if (s.ok()) { + rep->index_reader.reset(index_reader); - // Set filter block - if (rep->filter_policy) { - rep->filter.reset(ReadFilter(rep, nullptr)); - } - } else { - delete index_reader; + // Set filter block + if (rep->filter_policy) { + rep->filter.reset(ReadFilter(rep)); } + } else { + delete index_reader; } } @@ -652,21 +744,18 @@ size_t BlockBasedTable::ApproximateMemoryUsage() const { // Load the meta-block from the file. On success, return the loaded meta block // and its iterator. 
-Status BlockBasedTable::ReadMetaBlock( - Rep* rep, - std::unique_ptr* meta_block, - std::unique_ptr* iter) { +Status BlockBasedTable::ReadMetaBlock(Rep* rep, + std::unique_ptr* meta_block, + std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. // TODO: we never really verify check sum for meta index block std::unique_ptr meta; Status s = ReadBlockFromFile( - rep->file.get(), - rep->footer, - ReadOptions(), - rep->footer.metaindex_handle(), - &meta, - rep->ioptions.env); + rep->file.get(), rep->footer, ReadOptions(), + rep->footer.metaindex_handle(), &meta, rep->ioptions, + true /* decompress */, Slice() /*compression dict*/, + rep->persistent_cache_options); if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log, @@ -683,12 +772,14 @@ Status BlockBasedTable::ReadMetaBlock( Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, uint32_t format_version) { + Cache* block_cache, Cache* block_cache_compressed, + const ImmutableCFOptions &ioptions, const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block, uint32_t format_version, + const Slice& compression_dict) { Status s; Block* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; + Statistics* statistics = ioptions.statistics; // Lookup uncompressed cache first if (block_cache != nullptr) { @@ -729,7 +820,8 @@ Status BlockBasedTable::GetDataBlockFromCache( BlockContents contents; s = UncompressBlockContents(compressed_block->data(), compressed_block->size(), &contents, - format_version); + format_version, compression_dict, + ioptions); // Insert uncompressed block into block cache if (s.ok()) { @@ -737,11 +829,16 @@ Status BlockBasedTable::GetDataBlockFromCache( 
assert(block->value->compression_type() == kNoCompression); if (block_cache != nullptr && block->value->cachable() && read_options.fill_cache) { - block->cache_handle = block_cache->Insert(block_cache_key, block->value, - block->value->usable_size(), - &DeleteCachedEntry); - assert(reinterpret_cast( - block_cache->Value(block->cache_handle)) == block->value); + s = block_cache->Insert( + block_cache_key, block->value, block->value->usable_size(), + &DeleteCachedEntry, &(block->cache_handle)); + if (s.ok()) { + RecordTick(statistics, BLOCK_CACHE_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete block->value; + block->value = nullptr; + } } } @@ -753,17 +850,19 @@ Status BlockBasedTable::GetDataBlockFromCache( Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, Statistics* statistics, - CachableEntry* block, Block* raw_block, uint32_t format_version) { + const ReadOptions& read_options, const ImmutableCFOptions &ioptions, + CachableEntry* block, Block* raw_block, uint32_t format_version, + const Slice& compression_dict) { assert(raw_block->compression_type() == kNoCompression || block_cache_compressed != nullptr); Status s; // Retrieve the uncompressed contents into a new buffer BlockContents contents; + Statistics* statistics = ioptions.statistics; if (raw_block->compression_type() != kNoCompression) { s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents, - format_version); + format_version, compression_dict, ioptions); } if (!s.ok()) { delete raw_block; @@ -781,33 +880,43 @@ Status BlockBasedTable::PutDataBlockToCache( // Release the hold on the compressed cache entry immediately. 
if (block_cache_compressed != nullptr && raw_block != nullptr && raw_block->cachable()) { - auto cache_handle = block_cache_compressed->Insert( - compressed_block_cache_key, raw_block, raw_block->usable_size(), - &DeleteCachedEntry); - block_cache_compressed->Release(cache_handle); - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); - // Avoid the following code to delete this cached block. - raw_block = nullptr; + s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block, + raw_block->usable_size(), + &DeleteCachedEntry); + if (s.ok()) { + // Avoid the following code to delete this cached block. + raw_block = nullptr; + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } } delete raw_block; // insert into uncompressed block cache assert((block->value->compression_type() == kNoCompression)); if (block_cache != nullptr && block->value->cachable()) { - block->cache_handle = block_cache->Insert(block_cache_key, block->value, - block->value->usable_size(), - &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, - block->value->usable_size()); - assert(reinterpret_cast(block_cache->Value(block->cache_handle)) == - block->value); + s = block_cache->Insert(block_cache_key, block->value, + block->value->usable_size(), + &DeleteCachedEntry, &(block->cache_handle)); + if (s.ok()) { + assert(block->cache_handle != nullptr); + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, + block->value->usable_size()); + assert(reinterpret_cast( + block_cache->Value(block->cache_handle)) == block->value); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete block->value; + block->value = nullptr; + } } return s; } -FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep, size_t* filter_size) { +FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) { // TODO: We might want 
to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. if (rep->filter_type == Rep::FilterType::kNoFilter) { @@ -815,29 +924,29 @@ FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep, size_t* filter_size) { } BlockContents block; if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(), - rep->filter_handle, &block, rep->ioptions.env, - false).ok()) { + rep->filter_handle, &block, rep->ioptions, + false /* decompress */, Slice() /*compression dict*/, + rep->persistent_cache_options) + .ok()) { // Error reading the block return nullptr; } - if (filter_size) { - *filter_size = block.data.size(); - } - assert(rep->filter_policy); if (rep->filter_type == Rep::FilterType::kBlockFilter) { return new BlockBasedFilterBlockReader( rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr, - rep->table_options, rep->whole_key_filtering, std::move(block)); + rep->table_options, rep->whole_key_filtering, std::move(block), + rep->ioptions.statistics); } else if (rep->filter_type == Rep::FilterType::kFullFilter) { auto filter_bits_reader = rep->filter_policy->GetFilterBitsReader(block.data); if (filter_bits_reader != nullptr) { return new FullFilterBlockReader( rep->prefix_filtering ? 
rep->ioptions.prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), filter_bits_reader); + rep->whole_key_filtering, std::move(block), filter_bits_reader, + rep->ioptions.statistics); } } @@ -857,14 +966,19 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( return {rep_->filter.get(), nullptr /* cache handle */}; } - PERF_TIMER_GUARD(read_filter_block_nanos); - Cache* block_cache = rep_->table_options.block_cache.get(); if (rep_->filter_policy == nullptr /* do not use filter */ || block_cache == nullptr /* no block cache at all */) { return {nullptr /* filter */, nullptr /* cache handle */}; } + // we have a pinned filter block + if (rep_->filter_entry.IsSet()) { + return rep_->filter_entry; + } + + PERF_TIMER_GUARD(read_filter_block_nanos); + // Fetching from the cache char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, @@ -884,34 +998,48 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // Do not invoke any io. 
return CachableEntry(); } else { - size_t filter_size = 0; - filter = ReadFilter(rep_, &filter_size); + filter = ReadFilter(rep_); if (filter != nullptr) { - assert(filter_size > 0); - cache_handle = block_cache->Insert(key, filter, filter_size, - &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter_size); + assert(filter->size() > 0); + Status s = block_cache->Insert(key, filter, filter->size(), + &DeleteCachedFilterEntry, &cache_handle); + if (s.ok()) { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter->size()); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, filter->size()); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete filter; + return CachableEntry(); + } } } return { filter, cache_handle }; } -Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, - BlockIter* input_iter) { +InternalIterator* BlockBasedTable::NewIndexIterator( + const ReadOptions& read_options, BlockIter* input_iter, + CachableEntry* index_entry) { // index reader has already been pre-populated. 
if (rep_->index_reader) { return rep_->index_reader->NewIterator( input_iter, read_options.total_order_seek); } + // we have a pinned index block + if (rep_->index_entry.IsSet()) { + return rep_->index_entry.value->NewIterator(input_iter, + read_options.total_order_seek); + } + PERF_TIMER_GUARD(read_index_block_nanos); bool no_io = read_options.read_tier == kBlockCacheTier; Cache* block_cache = rep_->table_options.block_cache.get(); char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->footer.index_handle(), cache_key); + auto key = + GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->dummy_index_reader_offset, cache_key); Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, @@ -922,7 +1050,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, input_iter->SetStatus(Status::Incomplete("no blocking io")); return input_iter; } else { - return NewErrorIterator(Status::Incomplete("no blocking io")); + return NewErrorInternalIterator(Status::Incomplete("no blocking io")); } } @@ -933,31 +1061,50 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, } else { // Create index reader and put it in the cache. 
Status s; + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2"); s = CreateIndexReader(&index_reader); + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1"); + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3"); + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4"); + if (s.ok()) { + assert(index_reader != nullptr); + s = block_cache->Insert(key, index_reader, index_reader->usable_size(), + &DeleteCachedIndexEntry, &cache_handle); + } - if (!s.ok()) { + if (s.ok()) { + size_t usable_size = index_reader->usable_size(); + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usable_size); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usable_size); + } else { + if (index_reader != nullptr) { + delete index_reader; + } + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); // make sure if something goes wrong, index_reader shall remain intact. - assert(index_reader == nullptr); if (input_iter != nullptr) { input_iter->SetStatus(s); return input_iter; } else { - return NewErrorIterator(s); + return NewErrorInternalIterator(s); } } - cache_handle = - block_cache->Insert(key, index_reader, index_reader->usable_size(), - &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, - index_reader->usable_size()); } assert(cache_handle); auto* iter = index_reader->NewIterator( input_iter, read_options.total_order_seek); - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); + + // the caller would like to take ownership of the index block + // don't call RegisterCleanup() in this case, the caller will take care of it + if (index_entry != nullptr) { + *index_entry = {index_reader, cache_handle}; + } else { + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); + } + return iter; } @@ -965,8 +1112,8 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, // into an iterator 
over the contents of the corresponding block. // If input_iter is null, new a iterator // If input_iter is not null, update this iter and return it -Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, - const ReadOptions& ro, const Slice& index_value, +InternalIterator* BlockBasedTable::NewDataBlockIterator( + Rep* rep, const ReadOptions& ro, const Slice& index_value, BlockIter* input_iter) { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -987,10 +1134,14 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, input_iter->SetStatus(s); return input_iter; } else { - return NewErrorIterator(s); + return NewErrorInternalIterator(s); } } + Slice compression_dict; + if (rep->compression_dict_block) { + compression_dict = rep->compression_dict_block->data; + } // If either block cache is enabled, we'll try to read from it. if (block_cache != nullptr || block_cache_compressed != nullptr) { Statistics* statistics = rep->ioptions.statistics; @@ -1011,48 +1162,52 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, compressed_cache_key); } - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - statistics, ro, &block, - rep->table_options.format_version); + s = GetDataBlockFromCache( + key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro, &block, + rep->table_options.format_version, compression_dict); if (block.value == nullptr && !no_io && ro.fill_cache) { std::unique_ptr raw_block; { StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &raw_block, rep->ioptions.env, - block_cache_compressed == nullptr); + &raw_block, rep->ioptions, + block_cache_compressed == nullptr, + compression_dict, rep->persistent_cache_options); } if (s.ok()) { s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, - ro, statistics, &block, raw_block.release(), - rep->table_options.format_version); + ro, rep->ioptions, &block, raw_block.release(), 
+ rep->table_options.format_version, + compression_dict); } } } // Didn't get any data from block caches. - if (block.value == nullptr) { + if (s.ok() && block.value == nullptr) { if (no_io) { // Could not read from block_cache and can't do IO if (input_iter != nullptr) { input_iter->SetStatus(Status::Incomplete("no blocking io")); return input_iter; } else { - return NewErrorIterator(Status::Incomplete("no blocking io")); + return NewErrorInternalIterator(Status::Incomplete("no blocking io")); } } std::unique_ptr block_value; s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &block_value, rep->ioptions.env); + &block_value, rep->ioptions, true /* compress */, + compression_dict, rep->persistent_cache_options); if (s.ok()) { block.value = block_value.release(); } } - Iterator* iter; - if (block.value != nullptr) { + InternalIterator* iter; + if (s.ok()) { + assert(block.value != nullptr); iter = block.value->NewIterator(&rep->internal_comparator, input_iter); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, @@ -1061,11 +1216,12 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); } } else { + assert(block.value == nullptr); if (input_iter != nullptr) { input_iter->SetStatus(s); iter = input_iter; } else { - iter = NewErrorIterator(s); + iter = NewErrorInternalIterator(s); } } return iter; @@ -1074,18 +1230,19 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState(BlockBasedTable* table, - const ReadOptions& read_options) - : TwoLevelIteratorState( - table->rep_->ioptions.prefix_extractor != nullptr), + const ReadOptions& read_options, bool skip_filters) + : TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor != + nullptr), table_(table), - read_options_(read_options) {} + read_options_(read_options), + 
skip_filters_(skip_filters) {} - Iterator* NewSecondaryIterator(const Slice& index_value) override { + InternalIterator* NewSecondaryIterator(const Slice& index_value) override { return NewDataBlockIterator(table_->rep_, read_options_, index_value); } bool PrefixMayMatch(const Slice& internal_key) override { - if (read_options_.total_order_seek) { + if (read_options_.total_order_seek || skip_filters_) { return true; } return table_->PrefixMayMatch(internal_key); @@ -1095,6 +1252,7 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { // Don't own table_ BlockBasedTable* table_; const ReadOptions read_options_; + bool skip_filters_; }; // This will be broken if the user specifies an unusual implementation @@ -1115,9 +1273,12 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { } assert(rep_->ioptions.prefix_extractor != nullptr); - auto prefix = rep_->ioptions.prefix_extractor->Transform( - ExtractUserKey(internal_key)); - InternalKey internal_key_prefix(prefix, 0, kTypeValue); + auto user_key = ExtractUserKey(internal_key); + if (!rep_->ioptions.prefix_extractor->InDomain(user_key)) { + return true; + } + auto prefix = rep_->ioptions.prefix_extractor->Transform(user_key); + InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); bool may_match = true; @@ -1132,45 +1293,45 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { // First, try check with full filter auto filter_entry = GetFilter(true /* no io */); FilterBlockReader* filter = filter_entry.value; - if (filter != nullptr && !filter->IsBlockBased()) { - may_match = filter->PrefixMayMatch(prefix); - } - - // Then, try find it within each block - if (may_match) { - unique_ptr iiter(NewIndexIterator(no_io_read_options)); - iiter->Seek(internal_prefix); - - if (!iiter->Valid()) { - // we're past end of file - // if it's incomplete, it means that we avoided I/O - // and we're not 
really sure that we're past the end - // of the file - may_match = iiter->status().IsIncomplete(); - } else if (ExtractUserKey(iiter->key()).starts_with( - ExtractUserKey(internal_prefix))) { - // we need to check for this subtle case because our only - // guarantee is that "the key is a string >= last key in that data - // block" according to the doc/table_format.txt spec. - // - // Suppose iiter->key() starts with the desired prefix; it is not - // necessarily the case that the corresponding data block will - // contain the prefix, since iiter->key() need not be in the - // block. However, the next data block may contain the prefix, so - // we return true to play it safe. - may_match = true; - } else if (filter != nullptr && filter->IsBlockBased()) { - // iiter->key() does NOT start with the desired prefix. Because - // Seek() finds the first key that is >= the seek target, this - // means that iiter->key() > prefix. Thus, any data blocks coming - // after the data block corresponding to iiter->key() cannot - // possibly contain the key. Thus, the corresponding data block - // is the only on could potentially contain the prefix. 
- Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); - may_match = filter->PrefixMayMatch(prefix, handle.offset()); + if (filter != nullptr) { + if (!filter->IsBlockBased()) { + may_match = filter->PrefixMayMatch(prefix); + } else { + // Then, try find it within each block + unique_ptr iiter(NewIndexIterator(no_io_read_options)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if (ExtractUserKey(iiter->key()) + .starts_with(ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else if (filter->IsBlockBased()) { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only on could potentially contain the prefix. 
+ Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + may_match = filter->PrefixMayMatch(prefix, handle.offset()); + } } } @@ -1180,26 +1341,36 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); } - filter_entry.Release(rep_->table_options.block_cache.get()); + // if rep_->filter_entry is not set, we should call Release(); otherwise + // don't call, in this case we have a local copy in rep_->filter_entry, + // it's pinned to the cache and will be released in the destructor + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return may_match; } -Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, - Arena* arena) { - return NewTwoLevelIterator(new BlockEntryIteratorState(this, read_options), - NewIndexIterator(read_options), arena); +InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, + Arena* arena, + bool skip_filters) { + return NewTwoLevelIterator( + new BlockEntryIteratorState(this, read_options, skip_filters), + NewIndexIterator(read_options), arena); } -bool BlockBasedTable::FullFilterKeyMayMatch(FilterBlockReader* filter, +bool BlockBasedTable::FullFilterKeyMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, const Slice& internal_key) const { if (filter == nullptr || filter->IsBlockBased()) { return true; } Slice user_key = ExtractUserKey(internal_key); - if (!filter->KeyMayMatch(user_key)) { - return false; + if (filter->whole_key_filtering()) { + return filter->KeyMayMatch(user_key); } - if (rep_->ioptions.prefix_extractor && + if (!read_options.total_order_seek && rep_->ioptions.prefix_extractor && + rep_->ioptions.prefix_extractor->InDomain(user_key) && !filter->PrefixMayMatch( rep_->ioptions.prefix_extractor->Transform(user_key))) { return false; @@ -1207,21 +1378,27 @@ bool 
BlockBasedTable::FullFilterKeyMayMatch(FilterBlockReader* filter, return true; } -Status BlockBasedTable::Get( - const ReadOptions& read_options, const Slice& key, - GetContext* get_context) { +Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context, bool skip_filters) { Status s; - auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); + CachableEntry filter_entry; + if (!skip_filters) { + filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); + } FilterBlockReader* filter = filter_entry.value; // First check the full filter // If full filter not useful, Then go into each block - if (!FullFilterKeyMayMatch(filter, key)) { + if (!FullFilterKeyMayMatch(read_options, filter, key)) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); } else { BlockIter iiter; NewIndexIterator(read_options, &iiter); + PinnedIteratorsManager* pinned_iters_mgr = get_context->pinned_iters_mgr(); + bool pin_blocks = pinned_iters_mgr && pinned_iters_mgr->PinningEnabled(); + BlockIter* biter = nullptr; + bool done = false; for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { Slice handle_value = iiter.value(); @@ -1239,42 +1416,71 @@ Status BlockBasedTable::Get( RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); break; } else { - BlockIter biter; - NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); + BlockIter stack_biter; + if (pin_blocks) { + // We need to create the BlockIter on heap because we may need to + // pin it if we encounterd merge operands + biter = static_cast( + NewDataBlockIterator(rep_, read_options, iiter.value())); + } else { + biter = &stack_biter; + NewDataBlockIterator(rep_, read_options, iiter.value(), biter); + } - if (read_options.read_tier && biter.status().IsIncomplete()) { + if (read_options.read_tier == kBlockCacheTier && + biter->status().IsIncomplete()) { // couldn't get block from block_cache // Update Saver.state to Found because we are 
only looking for whether // we can guarantee the key is not there when "no_io" is set get_context->MarkKeyMayExist(); break; } - if (!biter.status().ok()) { - s = biter.status(); + if (!biter->status().ok()) { + s = biter->status(); break; } // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { + for (biter->Seek(key); biter->Valid(); biter->Next()) { ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { + if (!ParseInternalKey(biter->key(), &parsed_key)) { s = Status::Corruption(Slice()); } - if (!get_context->SaveValue(parsed_key, biter.value())) { + if (!get_context->SaveValue(parsed_key, biter->value(), pin_blocks)) { done = true; break; } } - s = biter.status(); + s = biter->status(); + + if (pin_blocks) { + if (get_context->State() == GetContext::kMerge) { + // Pin blocks as long as we are merging + pinned_iters_mgr->PinIteratorIfNeeded(biter); + } else { + delete biter; + } + biter = nullptr; + } else { + // biter is on stack, Nothing to clean + } } } + if (pin_blocks && biter != nullptr) { + delete biter; + } if (s.ok()) { s = iiter.status(); } } - filter_entry.Release(rep_->table_options.block_cache.get()); + // if rep_->filter_entry is not set, we should call Release(); otherwise + // don't call, in this case we have a local copy in rep_->filter_entry, + // it's pinned to the cache and will be released in the destructor + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } return s; } @@ -1326,7 +1532,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr iiter(NewIndexIterator(options)); + std::unique_ptr iiter(NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); CachableEntry block; @@ -1344,9 +1550,12 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, handle, 
cache_key_storage); Slice ckey; - s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, - options, &block, - rep_->table_options.format_version); + s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, + rep_->ioptions, options, &block, + rep_->table_options.format_version, + rep_->compression_dict_block + ? rep_->compression_dict_block->data + : Slice()); assert(s.ok()); bool in_cache = block.value != nullptr; if (in_cache) { @@ -1361,8 +1570,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, // 3. options // 4. internal_comparator // 5. index_type -Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, - Iterator* preloaded_meta_index_iter) { +Status BlockBasedTable::CreateIndexReader( + IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter) { // Some old version of block-based tables don't have index type present in // table properties. If that's the case we can safely use the kBinarySearch. auto index_type_on_file = BlockBasedTableOptions::kBinarySearch; @@ -1376,10 +1585,8 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } auto file = rep_->file.get(); - auto env = rep_->ioptions.env; auto comparator = &rep_->internal_comparator; const Footer& footer = rep_->footer; - if (index_type_on_file == BlockBasedTableOptions::kHashSearch && rep_->ioptions.prefix_extractor == nullptr) { Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log, @@ -1392,11 +1599,12 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, switch (index_type_on_file) { case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( - file, footer, footer.index_handle(), env, comparator, index_reader); + file, footer, footer.index_handle(), rep_->ioptions, comparator, + index_reader, rep_->persistent_cache_options); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; - std::unique_ptr meta_iter_guard; + std::unique_ptr 
meta_iter_guard; auto meta_index_iter = preloaded_meta_index_iter; if (meta_index_iter == nullptr) { auto s = ReadMetaBlock(rep_, &meta_guard, &meta_iter_guard); @@ -1407,19 +1615,16 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, "Unable to read the metaindex block." " Fall back to binary search index."); return BinarySearchIndexReader::Create( - file, footer, footer.index_handle(), env, comparator, index_reader); + file, footer, footer.index_handle(), rep_->ioptions, comparator, + index_reader, rep_->persistent_cache_options); } meta_index_iter = meta_iter_guard.get(); } - // We need to wrap data with internal_prefix_transform to make sure it can - // handle prefix correctly. - rep_->internal_prefix_transform.reset( - new InternalKeySliceTransform(rep_->ioptions.prefix_extractor)); return HashIndexReader::Create( - rep_->internal_prefix_transform.get(), footer, file, env, comparator, - footer.index_handle(), meta_index_iter, index_reader, - rep_->hash_index_allow_collision); + rep_->internal_prefix_transform.get(), footer, file, rep_->ioptions, + comparator, footer.index_handle(), meta_index_iter, index_reader, + rep_->hash_index_allow_collision, rep_->persistent_cache_options); } default: { std::string error_message = @@ -1430,7 +1635,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - unique_ptr index_iter(NewIndexIterator(ReadOptions())); + unique_ptr index_iter(NewIndexIterator(ReadOptions())); index_iter->Seek(key); uint64_t result; @@ -1484,7 +1689,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "Metaindex Details:\n" "--------------------------------------\n"); std::unique_ptr meta; - std::unique_ptr meta_iter; + std::unique_ptr meta_iter; Status s = ReadMetaBlock(rep_, &meta, &meta_iter); if (s.ok()) { for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { @@ -1496,6 +1701,10 @@ Status 
BlockBasedTable::DumpTable(WritableFile* out_file) { out_file->Append(" Properties block handle: "); out_file->Append(meta_iter->value().ToString(true).c_str()); out_file->Append("\n"); + } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) { + out_file->Append(" Compression dictionary block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); } else if (strstr(meta_iter->key().ToString().c_str(), "filter.rocksdb.") != nullptr) { out_file->Append(" Filter block handle: "); @@ -1533,11 +1742,15 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { BlockHandle handle; if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { BlockContents block; - if (ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(), - handle, &block, rep_->ioptions.env, false).ok()) { + if (ReadBlockContents( + rep_->file.get(), rep_->footer, ReadOptions(), handle, &block, + rep_->ioptions, false /*decompress*/, + Slice() /*compression dict*/, rep_->persistent_cache_options) + .ok()) { rep_->filter.reset(new BlockBasedFilterBlockReader( rep_->ioptions.prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block))); + table_options.whole_key_filtering, std::move(block), + rep_->ioptions.statistics)); } } } @@ -1562,12 +1775,31 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { return s; } +void BlockBasedTable::Close() { + rep_->filter_entry.Release(rep_->table_options.block_cache.get()); + rep_->index_entry.Release(rep_->table_options.block_cache.get()); + // cleanup index and filter blocks to avoid accessing dangling pointer + if (!rep_->table_options.no_block_cache) { + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + // Get the filter block key + auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->footer.metaindex_handle(), cache_key); + rep_->table_options.block_cache.get()->Erase(key); + // Get the index block key + key 
= GetCacheKeyFromOffset(rep_->cache_key_prefix, + rep_->cache_key_prefix_size, + rep_->dummy_index_reader_offset, cache_key); + rep_->table_options.block_cache.get()->Erase(key); + } +} + Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr blockhandles_iter(NewIndexIterator(ReadOptions())); + std::unique_ptr blockhandles_iter( + NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { out_file->Append("Can not read Index Block \n\n"); @@ -1608,7 +1840,8 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr blockhandles_iter(NewIndexIterator(ReadOptions())); + std::unique_ptr blockhandles_iter( + NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { out_file->Append("Can not read Index Block \n\n"); @@ -1630,7 +1863,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { out_file->Append("\n"); out_file->Append("--------------------------------------\n"); - std::unique_ptr datablock_iter; + std::unique_ptr datablock_iter; datablock_iter.reset( NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value())); s = datablock_iter->status(); @@ -1683,4 +1916,26 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { return Status::OK(); } +namespace { + +void DeleteCachedFilterEntry(const Slice& key, void* value) { + FilterBlockReader* filter = reinterpret_cast(value); + if (filter->statistics() != nullptr) { + RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, + filter->size()); + } + delete filter; +} + +void DeleteCachedIndexEntry(const Slice& key, void* value) { + IndexReader* index_reader = reinterpret_cast(value); + if (index_reader->statistics() != nullptr) { + RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, + 
index_reader->usable_size()); + } + delete index_reader; +} + +} // anonymous namespace + } // namespace rocksdb diff --git a/external/rocksdb/table/block_based_table_reader.h b/external/rocksdb/table/block_based_table_reader.h index d81f610b80..9c8b1b1148 100644 --- a/external/rocksdb/table/block_based_table_reader.h +++ b/external/rocksdb/table/block_based_table_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -15,11 +15,12 @@ #include #include "rocksdb/options.h" +#include "rocksdb/persistent_cache.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/table_reader.h" #include "table/table_properties_internal.h" +#include "table/table_reader.h" #include "util/coding.h" #include "util/file_reader_writer.h" @@ -43,6 +44,7 @@ struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; class GetContext; +class InternalIterator; using std::unique_ptr; @@ -53,6 +55,9 @@ class BlockBasedTable : public TableReader { public: static const std::string kFilterBlockPrefix; static const std::string kFullFilterBlockPrefix; + // The longest prefix of the cache key used to identify blocks. + // For Posix files the unique ID is three varints. + static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -63,29 +68,37 @@ class BlockBasedTable : public TableReader { // If there was an error while initializing the table, sets "*table_reader" // to nullptr and returns a non-ok status. // - // *file must remain live while this Table is in use. 
- // *prefetch_blocks can be used to disable prefetching of index and filter - // blocks at statup + // @param file must remain live while this Table is in use. + // @param prefetch_index_and_filter_in_cache can be used to disable + // prefetching of + // index and filter blocks into block cache at startup + // @param skip_filters Disables loading/accessing the filter block. Overrides + // prefetch_index_and_filter_in_cache, so filter will be skipped if both + // are set. static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, - bool prefetch_index_and_filter = true); + bool prefetch_index_and_filter_in_cache = true, + bool skip_filters = false, int level = -1); bool PrefixMayMatch(const Slice& internal_key); // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). - Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; + // @param skip_filters Disables loading/accessing the filter block + InternalIterator* NewIterator(const ReadOptions&, Arena* arena = nullptr, + bool skip_filters = false) override; + // @param skip_filters Disables loading/accessing the filter block Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context) override; + GetContext* get_context, bool skip_filters = false) override; // Pre-fetch the disk blocks that correspond to the key range specified by - // (kbegin, kend). The call will return return error status in the event of + // (kbegin, kend). The call will return error status in the event of // IO or iteration error. 
Status Prefetch(const Slice* begin, const Slice* end) override; @@ -112,6 +125,8 @@ class BlockBasedTable : public TableReader { // convert SST file to a human readable form Status DumpTable(WritableFile* out_file) override; + void Close() override; + ~BlockBasedTable(); bool TEST_filter_block_preloaded() const; @@ -119,6 +134,10 @@ class BlockBasedTable : public TableReader { // Implementation of IndexReader will be exposed to internal cc file only. class IndexReader; + static Slice GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key); + private: template struct CachableEntry; @@ -129,9 +148,9 @@ class BlockBasedTable : public TableReader { class BlockEntryIteratorState; // input_iter: if it is not null, update this one and return it as Iterator - static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, - const Slice& index_value, - BlockIter* input_iter = nullptr); + static InternalIterator* NewDataBlockIterator( + Rep* rep, const ReadOptions& ro, const Slice& index_value, + BlockIter* input_iter = nullptr); // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -148,18 +167,23 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - Iterator* NewIndexIterator(const ReadOptions& read_options, - BlockIter* input_iter = nullptr); + InternalIterator* NewIndexIterator( + const ReadOptions& read_options, BlockIter* input_iter = nullptr, + CachableEntry* index_entry = nullptr); // Read block cache from block caches (if set): block_cache and // block_cache_compressed. // On success, Status::OK with be returned and @block will be populated with // pointer to the block as well as its block handle. + // @param compression_dict Data for presetting the compression library's + // dictionary. 
static Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, uint32_t format_version); + Cache* block_cache, Cache* block_cache_compressed, + const ImmutableCFOptions &ioptions, const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block, uint32_t format_version, + const Slice& compression_dict); + // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then // populate the block caches. @@ -168,11 +192,14 @@ class BlockBasedTable : public TableReader { // // REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be // responsible for releasing its memory if error occurs. + // @param compression_dict Data for presetting the compression library's + // dictionary. static Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, Statistics* statistics, - CachableEntry* block, Block* raw_block, uint32_t format_version); + const ReadOptions& read_options, const ImmutableCFOptions &ioptions, + CachableEntry* block, Block* raw_block, uint32_t format_version, + const Slice& compression_dict); // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -186,22 +213,22 @@ class BlockBasedTable : public TableReader { // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter // helps avoid re-reading meta index block if caller already created one. 
- Status CreateIndexReader(IndexReader** index_reader, - Iterator* preloaded_meta_index_iter = nullptr); + Status CreateIndexReader( + IndexReader** index_reader, + InternalIterator* preloaded_meta_index_iter = nullptr); - bool FullFilterKeyMayMatch(FilterBlockReader* filter, + bool FullFilterKeyMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, const Slice& user_key) const; // Read the meta block from sst. - static Status ReadMetaBlock( - Rep* rep, - std::unique_ptr* meta_block, - std::unique_ptr* iter); + static Status ReadMetaBlock(Rep* rep, std::unique_ptr* meta_block, + std::unique_ptr* iter); // Create the filter from the filter block. - static FilterBlockReader* ReadFilter(Rep* rep, size_t* filter_size = nullptr); + static FilterBlockReader* ReadFilter(Rep* rep); - static void SetupCacheKeyPrefix(Rep* rep); + static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size); explicit BlockBasedTable(Rep* rep) : rep_(rep), compaction_optimized_(false) {} @@ -212,10 +239,6 @@ class BlockBasedTable : public TableReader { static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer, size_t* size); - // The longest prefix of the cache key used to identify blocks. - // For Posix files the unique ID is three varints. - static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; - // Helper functions for DumpTable() Status DumpIndexBlock(WritableFile* out_file); Status DumpDataBlocks(WritableFile* out_file); diff --git a/external/rocksdb/table/block_builder.cc b/external/rocksdb/table/block_builder.cc index 1eee96d468..10901aaa02 100644 --- a/external/rocksdb/table/block_builder.cc +++ b/external/rocksdb/table/block_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -41,30 +41,27 @@ namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval) +BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding) : block_restart_interval_(block_restart_interval), + use_delta_encoding_(use_delta_encoding), restarts_(), counter_(0), finished_(false) { assert(block_restart_interval_ >= 1); restarts_.push_back(0); // First restart point is at offset 0 + estimate_ = sizeof(uint32_t) + sizeof(uint32_t); } void BlockBuilder::Reset() { buffer_.clear(); restarts_.clear(); restarts_.push_back(0); // First restart point is at offset 0 + estimate_ = sizeof(uint32_t) + sizeof(uint32_t); counter_ = 0; finished_ = false; last_key_.clear(); } -size_t BlockBuilder::CurrentSizeEstimate() const { - return (buffer_.size() + // Raw data buffer - restarts_.size() * sizeof(uint32_t) + // Restart array - sizeof(uint32_t)); // Restart array length -} - size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) const { size_t estimate = CurrentSizeEstimate(); @@ -91,37 +88,44 @@ Slice BlockBuilder::Finish() { } void BlockBuilder::Add(const Slice& key, const Slice& value) { - Slice last_key_piece(last_key_); assert(!finished_); assert(counter_ <= block_restart_interval_); - size_t shared = 0; - if (counter_ < block_restart_interval_) { - // See how much sharing to do with previous string - const size_t min_length = std::min(last_key_piece.size(), key.size()); - while ((shared < min_length) && (last_key_piece[shared] == key[shared])) { - shared++; - } - } else { + size_t shared = 0; // number of bytes shared with prev key + if (counter_ >= block_restart_interval_) { // Restart compression restarts_.push_back(static_cast(buffer_.size())); + estimate_ += sizeof(uint32_t); counter_ = 0; + + if (use_delta_encoding_) { + // Update state + last_key_.assign(key.data(), key.size()); + } + } else if (use_delta_encoding_) { 
+ Slice last_key_piece(last_key_); + // See how much sharing to do with previous string + shared = key.difference_offset(last_key_piece); + + // Update state + // We used to just copy the changed data here, but it appears to be + // faster to just copy the whole thing. + last_key_.assign(key.data(), key.size()); } + const size_t non_shared = key.size() - shared; + const size_t curr_size = buffer_.size(); // Add "" to buffer_ - PutVarint32(&buffer_, static_cast(shared)); - PutVarint32(&buffer_, static_cast(non_shared)); - PutVarint32(&buffer_, static_cast(value.size())); + PutVarint32Varint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared), + static_cast(value.size())); // Add string delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); buffer_.append(value.data(), value.size()); - // Update state - last_key_.resize(shared); - last_key_.append(key.data() + shared, non_shared); - assert(Slice(last_key_) == key); counter_++; + estimate_ += buffer_.size() - curr_size; } } // namespace rocksdb diff --git a/external/rocksdb/table/block_builder.h b/external/rocksdb/table/block_builder.h index c01a23bea9..898e1ade21 100644 --- a/external/rocksdb/table/block_builder.h +++ b/external/rocksdb/table/block_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -20,7 +20,8 @@ class BlockBuilder { BlockBuilder(const BlockBuilder&) = delete; void operator=(const BlockBuilder&) = delete; - explicit BlockBuilder(int block_restart_interval); + explicit BlockBuilder(int block_restart_interval, + bool use_delta_encoding = true); // Reset the contents as if the BlockBuilder was just constructed. 
void Reset(); @@ -36,7 +37,7 @@ class BlockBuilder { // Returns an estimate of the current (uncompressed) size of the block // we are building. - size_t CurrentSizeEstimate() const; + inline size_t CurrentSizeEstimate() const { return estimate_; } // Returns an estimated block size after appending key and value. size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; @@ -48,9 +49,11 @@ class BlockBuilder { private: const int block_restart_interval_; + const bool use_delta_encoding_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points + size_t estimate_; int counter_; // Number of entries emitted since restart bool finished_; // Has Finish() been called? std::string last_key_; diff --git a/external/rocksdb/table/block_hash_index.cc b/external/rocksdb/table/block_hash_index.cc deleted file mode 100644 index fd1329660a..0000000000 --- a/external/rocksdb/table/block_hash_index.cc +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#include "table/block_hash_index.h" - -#include - -#include "rocksdb/comparator.h" -#include "rocksdb/iterator.h" -#include "rocksdb/slice_transform.h" -#include "util/coding.h" - -namespace rocksdb { - -Status CreateBlockHashIndex(const SliceTransform* hash_key_extractor, - const Slice& prefixes, const Slice& prefix_meta, - BlockHashIndex** hash_index) { - uint64_t pos = 0; - auto meta_pos = prefix_meta; - Status s; - *hash_index = new BlockHashIndex( - hash_key_extractor, - false /* external module manages memory space for prefixes */); - - while (!meta_pos.empty()) { - uint32_t prefix_size = 0; - uint32_t entry_index = 0; - uint32_t num_blocks = 0; - if (!GetVarint32(&meta_pos, &prefix_size) || - !GetVarint32(&meta_pos, &entry_index) || - !GetVarint32(&meta_pos, &num_blocks)) { - s = Status::Corruption( - "Corrupted prefix meta block: unable to read from it."); - break; - } - Slice prefix(prefixes.data() + pos, prefix_size); - (*hash_index)->Add(prefix, entry_index, num_blocks); - - pos += prefix_size; - } - - if (s.ok() && pos != prefixes.size()) { - s = Status::Corruption("Corrupted prefix meta block"); - } - - if (!s.ok()) { - delete *hash_index; - } - - return s; -} - -BlockHashIndex* CreateBlockHashIndexOnTheFly( - Iterator* index_iter, Iterator* data_iter, const uint32_t num_restarts, - const Comparator* comparator, const SliceTransform* hash_key_extractor) { - assert(hash_key_extractor); - auto hash_index = new BlockHashIndex( - hash_key_extractor, - true /* hash_index will copy prefix when Add() is called */); - uint32_t current_restart_index = 0; - - std::string pending_entry_prefix; - // pending_block_num == 0 also implies there is no entry inserted at all. - uint32_t pending_block_num = 0; - uint32_t pending_entry_index = 0; - - // scan all the entries and create a hash index based on their prefixes. 
- data_iter->SeekToFirst(); - for (index_iter->SeekToFirst(); - index_iter->Valid() && current_restart_index < num_restarts; - index_iter->Next()) { - Slice last_key_in_block = index_iter->key(); - assert(data_iter->Valid() && data_iter->status().ok()); - - // scan through all entries within a data block. - while (data_iter->Valid() && - comparator->Compare(data_iter->key(), last_key_in_block) <= 0) { - auto key_prefix = hash_key_extractor->Transform(data_iter->key()); - bool is_first_entry = pending_block_num == 0; - - // Keys may share the prefix - if (is_first_entry || pending_entry_prefix != key_prefix) { - if (!is_first_entry) { - bool succeeded = hash_index->Add( - pending_entry_prefix, pending_entry_index, pending_block_num); - if (!succeeded) { - delete hash_index; - return nullptr; - } - } - - // update the status. - // needs a hard copy otherwise the underlying data changes all the time. - pending_entry_prefix = key_prefix.ToString(); - pending_block_num = 1; - pending_entry_index = current_restart_index; - } else { - // entry number increments when keys share the prefix reside in - // different data blocks. - auto last_restart_index = pending_entry_index + pending_block_num - 1; - assert(last_restart_index <= current_restart_index); - if (last_restart_index != current_restart_index) { - ++pending_block_num; - } - } - data_iter->Next(); - } - - ++current_restart_index; - } - - // make sure all entries has been scaned. 
- assert(!index_iter->Valid()); - assert(!data_iter->Valid()); - - if (pending_block_num > 0) { - auto succeeded = hash_index->Add(pending_entry_prefix, pending_entry_index, - pending_block_num); - if (!succeeded) { - delete hash_index; - return nullptr; - } - } - - return hash_index; -} - -bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index, - uint32_t num_blocks) { - auto prefix_to_insert = prefix; - if (kOwnPrefixes) { - auto prefix_ptr = arena_.Allocate(prefix.size()); - // MSVC reports C4996 Function call with parameters that may be - // unsafe when using std::copy with a output iterator - pointer - memcpy(prefix_ptr, prefix.data(), prefix.size()); - prefix_to_insert = Slice(prefix_ptr, prefix.size()); - } - auto result = restart_indices_.insert( - {prefix_to_insert, RestartIndex(restart_index, num_blocks)}); - return result.second; -} - -const BlockHashIndex::RestartIndex* BlockHashIndex::GetRestartIndex( - const Slice& key) { - auto key_prefix = hash_key_extractor_->Transform(key); - - auto pos = restart_indices_.find(key_prefix); - if (pos == restart_indices_.end()) { - return nullptr; - } - - return &pos->second; -} - -} // namespace rocksdb diff --git a/external/rocksdb/table/block_hash_index.h b/external/rocksdb/table/block_hash_index.h deleted file mode 100644 index 5829107967..0000000000 --- a/external/rocksdb/table/block_hash_index.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
-#pragma once - -#include -#include - -#include "rocksdb/status.h" -#include "util/arena.h" -#include "util/murmurhash.h" - -namespace rocksdb { - -class Comparator; -class Iterator; -class Slice; -class SliceTransform; - -// Build a hash-based index to speed up the lookup for "index block". -// BlockHashIndex accepts a key and, if found, returns its restart index within -// that index block. -class BlockHashIndex { - public: - // Represents a restart index in the index block's restart array. - struct RestartIndex { - explicit RestartIndex(uint32_t _first_index, uint32_t _num_blocks = 1) - : first_index(_first_index), num_blocks(_num_blocks) {} - - // For a given prefix, what is the restart index for the first data block - // that contains it. - uint32_t first_index = 0; - - // How many data blocks contains this prefix? - uint32_t num_blocks = 1; - }; - - // @params own_prefixes indicate if we should take care the memory space for - // the `key_prefix` - // passed by Add() - explicit BlockHashIndex(const SliceTransform* hash_key_extractor, - bool own_prefixes) - : hash_key_extractor_(hash_key_extractor), kOwnPrefixes(own_prefixes) {} - - // Maps a key to its restart first_index. - // Returns nullptr if the restart first_index is found - const RestartIndex* GetRestartIndex(const Slice& key); - - bool Add(const Slice& key_prefix, uint32_t restart_index, - uint32_t num_blocks); - - size_t ApproximateMemoryUsage() const { - return arena_.ApproximateMemoryUsage(); - } - - private: - const SliceTransform* hash_key_extractor_; - std::unordered_map restart_indices_; - - Arena arena_; - bool kOwnPrefixes; -}; - -// Create hash index by reading from the metadata blocks. -// @params prefixes: a sequence of prefixes. -// @params prefix_meta: contains the "metadata" to of the prefixes. 
-Status CreateBlockHashIndex(const SliceTransform* hash_key_extractor, - const Slice& prefixes, const Slice& prefix_meta, - BlockHashIndex** hash_index); - -// Create hash index by scanning the entries in index as well as the whole -// dataset. -// @params index_iter: an iterator with the pointer to the first entry in a -// block. -// @params data_iter: an iterator that can scan all the entries reside in a -// table. -// @params num_restarts: used for correctness verification. -// @params hash_key_extractor: extract the hashable part of a given key. -// On error, nullptr will be returned. -BlockHashIndex* CreateBlockHashIndexOnTheFly( - Iterator* index_iter, Iterator* data_iter, const uint32_t num_restarts, - const Comparator* comparator, const SliceTransform* hash_key_extractor); - -} // namespace rocksdb diff --git a/external/rocksdb/table/block_hash_index_test.cc b/external/rocksdb/table/block_hash_index_test.cc deleted file mode 100644 index b001c203a4..0000000000 --- a/external/rocksdb/table/block_hash_index_test.cc +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#include -#include -#include - -#include "rocksdb/comparator.h" -#include "rocksdb/iterator.h" -#include "rocksdb/slice_transform.h" -#include "table/block_hash_index.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -typedef std::map Data; - -class MapIterator : public Iterator { - public: - explicit MapIterator(const Data& data) : data_(data), pos_(data_.end()) {} - - virtual bool Valid() const override { return pos_ != data_.end(); } - - virtual void SeekToFirst() override { pos_ = data_.begin(); } - - virtual void SeekToLast() override { - pos_ = data_.end(); - --pos_; - } - - virtual void Seek(const Slice& target) override { - pos_ = data_.find(target.ToString()); - } - - virtual void Next() override { ++pos_; } - - virtual void Prev() override { --pos_; } - - virtual Slice key() const override { return pos_->first; } - - virtual Slice value() const override { return pos_->second; } - - virtual Status status() const override { return Status::OK(); } - - private: - const Data& data_; - Data::const_iterator pos_; -}; - -class BlockTest : public testing::Test {}; - -TEST_F(BlockTest, BasicTest) { - const size_t keys_per_block = 4; - const size_t prefix_size = 2; - std::vector keys = {/* block 1 */ - "0101", "0102", "0103", "0201", - /* block 2 */ - "0202", "0203", "0301", "0401", - /* block 3 */ - "0501", "0601", "0701", "0801", - /* block 4 */ - "0802", "0803", "0804", "0805", - /* block 5 */ - "0806", "0807", "0808", "0809", }; - - Data data_entries; - for (const auto key : keys) { - data_entries.insert({key, key}); - } - - Data index_entries; - for (size_t i = 3; i < keys.size(); i += keys_per_block) { - // simply ignore the value part - index_entries.insert({keys[i], ""}); - } - - MapIterator data_iter(data_entries); - MapIterator index_iter(index_entries); - - auto prefix_extractor = NewFixedPrefixTransform(prefix_size); - std::unique_ptr block_hash_index(CreateBlockHashIndexOnTheFly( - &index_iter, &data_iter, 
static_cast(index_entries.size()), - BytewiseComparator(), prefix_extractor)); - - std::map expected = { - {"01xx", BlockHashIndex::RestartIndex(0, 1)}, - {"02yy", BlockHashIndex::RestartIndex(0, 2)}, - {"03zz", BlockHashIndex::RestartIndex(1, 1)}, - {"04pp", BlockHashIndex::RestartIndex(1, 1)}, - {"05ww", BlockHashIndex::RestartIndex(2, 1)}, - {"06xx", BlockHashIndex::RestartIndex(2, 1)}, - {"07pp", BlockHashIndex::RestartIndex(2, 1)}, - {"08xz", BlockHashIndex::RestartIndex(2, 3)}, }; - - const BlockHashIndex::RestartIndex* index = nullptr; - // search existed prefixes - for (const auto& item : expected) { - index = block_hash_index->GetRestartIndex(item.first); - ASSERT_TRUE(index != nullptr); - ASSERT_EQ(item.second.first_index, index->first_index); - ASSERT_EQ(item.second.num_blocks, index->num_blocks); - } - - // search non exist prefixes - ASSERT_TRUE(!block_hash_index->GetRestartIndex("00xx")); - ASSERT_TRUE(!block_hash_index->GetRestartIndex("10yy")); - ASSERT_TRUE(!block_hash_index->GetRestartIndex("20zz")); - - delete prefix_extractor; -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/external/rocksdb/table/block_prefix_index.cc b/external/rocksdb/table/block_prefix_index.cc index 147bcf56e4..10fcb05754 100644 --- a/external/rocksdb/table/block_prefix_index.cc +++ b/external/rocksdb/table/block_prefix_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -136,6 +136,7 @@ class BlockPrefixIndex::Builder { assert(prefixes_per_bucket[i]->next == nullptr); buckets[i] = prefixes_per_bucket[i]->start_block; } else { + assert(total_block_array_entries > 0); assert(prefixes_per_bucket[i] != nullptr); buckets[i] = EncodeIndex(offset); block_array_buffer[offset] = num_blocks; diff --git a/external/rocksdb/table/block_prefix_index.h b/external/rocksdb/table/block_prefix_index.h index bc36c48f6a..d9c3b97e0a 100644 --- a/external/rocksdb/table/block_prefix_index.h +++ b/external/rocksdb/table/block_prefix_index.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/table/block_test.cc b/external/rocksdb/table/block_test.cc index c86f38da5d..424df87a3f 100644 --- a/external/rocksdb/table/block_test.cc +++ b/external/rocksdb/table/block_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -18,7 +18,6 @@ #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" -#include "table/block_hash_index.h" #include "util/random.h" #include "util/testharness.h" #include "util/testutil.h" @@ -96,7 +95,7 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; - Iterator* iter = reader.NewIterator(options.comparator); + InternalIterator *iter = reader.NewIterator(options.comparator); for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) { // read kv from block @@ -159,30 +158,16 @@ void CheckBlockContents(BlockContents contents, const int max_key, std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); - { - auto iter1 = reader1.NewIterator(nullptr); - auto iter2 = reader1.NewIterator(nullptr); - reader1.SetBlockHashIndex(CreateBlockHashIndexOnTheFly( - iter1, iter2, static_cast(keys.size()), BytewiseComparator(), - prefix_extractor.get())); - - delete iter1; - delete iter2; - } - - std::unique_ptr hash_iter( - reader1.NewIterator(BytewiseComparator(), nullptr, false)); - - std::unique_ptr regular_iter( + std::unique_ptr regular_iter( reader2.NewIterator(BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { - hash_iter->Seek(keys[i]); - ASSERT_OK(hash_iter->status()); - ASSERT_TRUE(hash_iter->Valid()); + regular_iter->Seek(keys[i]); + ASSERT_OK(regular_iter->status()); + ASSERT_TRUE(regular_iter->Valid()); - Slice v = hash_iter->value(); + Slice v = regular_iter->value(); ASSERT_EQ(v.ToString().compare(values[i]), 0); } @@ -192,9 +177,6 @@ void CheckBlockContents(BlockContents contents, const int max_key, // return the one that is closest. 
for (int i = 1; i < max_key - 1; i += 2) { auto key = GenerateKey(i, 0, 0, nullptr); - hash_iter->Seek(key); - ASSERT_TRUE(!hash_iter->Valid()); - regular_iter->Seek(key); ASSERT_TRUE(regular_iter->Valid()); } diff --git a/external/rocksdb/table/bloom_block.cc b/external/rocksdb/table/bloom_block.cc index cfea8a2c5d..7eef9cc05b 100644 --- a/external/rocksdb/table/bloom_block.cc +++ b/external/rocksdb/table/bloom_block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/table/bloom_block.h b/external/rocksdb/table/bloom_block.h index 5b60d2bca5..5ba74601fc 100644 --- a/external/rocksdb/table/bloom_block.h +++ b/external/rocksdb/table/bloom_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/table/cuckoo_table_builder.cc b/external/rocksdb/table/cuckoo_table_builder.cc index 946a8b5fbd..0f93b54edf 100644 --- a/external/rocksdb/table/cuckoo_table_builder.cc +++ b/external/rocksdb/table/cuckoo_table_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -52,7 +52,8 @@ CuckooTableBuilder::CuckooTableBuilder( uint32_t max_num_hash_table, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, bool use_module_hash, bool identity_as_first_hash, - uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), + uint32_t column_family_id, const std::string& column_family_name) : num_hash_func_(2), file_(file), max_hash_table_ratio_(max_hash_table_ratio), @@ -76,6 +77,8 @@ CuckooTableBuilder::CuckooTableBuilder( properties_.num_data_blocks = 1; properties_.index_size = 0; properties_.filter_size = 0; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { @@ -109,8 +112,6 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { status_ = Status::NotSupported("all keys have to be the same size"); return; } - // Even if one sequence number is non-zero, then it is not last level. - assert(!is_last_level_file_ || ikey.sequence == 0); if (ikey.type == kTypeValue) { if (!has_seen_first_value_) { @@ -248,7 +249,8 @@ Status CuckooTableBuilder::Finish() { if (num_entries_ > 0) { // Calculate the real hash size if module hash is enabled. if (use_module_hash_) { - hash_table_size_ = num_entries_ / max_hash_table_ratio_; + hash_table_size_ = + static_cast(num_entries_ / max_hash_table_ratio_); } s = MakeHashTable(&buckets); if (!s.ok()) { @@ -404,7 +406,8 @@ uint64_t CuckooTableBuilder::FileSize() const { } if (use_module_hash_) { - return (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_; + return static_cast((key_size_ + value_size_) * + num_entries_ / max_hash_table_ratio_); } else { // Account for buckets being a power of two. 
// As elements are added, file size remains constant for a while and diff --git a/external/rocksdb/table/cuckoo_table_builder.h b/external/rocksdb/table/cuckoo_table_builder.h index 093e1c245e..266a71be04 100644 --- a/external/rocksdb/table/cuckoo_table_builder.h +++ b/external/rocksdb/table/cuckoo_table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -27,7 +27,9 @@ class CuckooTableBuilder: public TableBuilder { uint32_t cuckoo_block_size, bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, - uint64_t)); + uint64_t), + uint32_t column_family_id, + const std::string& column_family_name); // REQUIRES: Either Finish() or Abandon() has been called. ~CuckooTableBuilder() {} diff --git a/external/rocksdb/table/cuckoo_table_builder_test.cc b/external/rocksdb/table/cuckoo_table_builder_test.cc index 2ee87fb1ee..ac5bed1575 100644 --- a/external/rocksdb/table/cuckoo_table_builder_test.cc +++ b/external/rocksdb/table/cuckoo_table_builder_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -49,12 +49,16 @@ class CuckooBuilderTest : public testing::Test { uint64_t read_file_size; ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + Options options; + options.allow_mmap_reads = true; + ImmutableCFOptions ioptions(options); + // Assert Table Properties. 
TableProperties* props = nullptr; unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file))); ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, - kCuckooTableMagicNumber, env_, nullptr, + kCuckooTableMagicNumber, ioptions, &props)); // Check unused bucket. std::string unused_key = props->user_collected_properties[ @@ -89,6 +93,8 @@ class CuckooBuilderTest : public testing::Test { ASSERT_EQ(props->data_size, expected_unused_bucket.size() * (expected_table_size + expected_cuckoo_block_size - 1)); ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); + ASSERT_EQ(props->column_family_id, 0); + ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName); delete props; // Check contents of the bucket. @@ -129,6 +135,11 @@ class CuckooBuilderTest : public testing::Test { return n; } + uint64_t GetExpectedTableSize(uint64_t num) { + return NextPowOf2(static_cast(num / kHashTableRatio)); + } + + Env* env_; EnvOptions env_options_; std::string fname; @@ -143,7 +154,8 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); @@ -169,7 +181,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; @@ -178,7 +190,8 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder 
builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -216,7 +229,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; @@ -225,7 +238,8 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -263,7 +277,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); unique_ptr writable_file; uint32_t cuckoo_block_size = 2; @@ -271,9 +285,10 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), EnvOptions())); - CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, - 100, BytewiseComparator(), cuckoo_block_size, - false, false, GetSliceHash); + CuckooTableBuilder 
builder( + file_writer.get(), kHashTableRatio, num_hash_fun, 100, + BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, + 0 /* column_family_id */, kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -316,7 +331,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; @@ -325,7 +340,8 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -365,7 +381,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; @@ -374,7 +390,8 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; 
i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -407,7 +424,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { hash_map = std::move(hm); std::vector expected_locations = {0, 1, 2, 3}; - uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; @@ -416,7 +433,8 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -450,7 +468,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { hash_map = std::move(hm); std::vector expected_locations = {0, 1, 2, 3}; - uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; @@ -459,7 +477,8 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -495,7 +514,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { hash_map = std::move(hm); std::vector expected_locations = {0, 1, 
3, 4, 2}; - uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; @@ -504,7 +523,8 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -548,7 +568,8 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -575,7 +596,8 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { new WritableFileWriter(std::move(writable_file), EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, - GetSliceHash); + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); diff --git a/external/rocksdb/table/cuckoo_table_factory.cc b/external/rocksdb/table/cuckoo_table_factory.cc index 16bf3fbe50..87107ee7fc 100644 --- a/external/rocksdb/table/cuckoo_table_factory.cc +++ b/external/rocksdb/table/cuckoo_table_factory.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -15,7 +15,8 @@ namespace rocksdb { Status CuckooTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, - std::unique_ptr* table) const { + std::unique_ptr* table, + bool prefetch_index_and_filter_in_cache) const { std::unique_ptr new_reader(new CuckooTableReader( table_reader_options.ioptions, std::move(file), file_size, table_reader_options.internal_comparator.user_comparator(), nullptr)); @@ -27,7 +28,7 @@ Status CuckooTableFactory::NewTableReader( } TableBuilder* CuckooTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, WritableFileWriter* file) const { // Ignore the skipFIlters flag. Does not apply to this file format // @@ -38,7 +39,8 @@ TableBuilder* CuckooTableFactory::NewTableBuilder( table_options_.max_search_depth, table_builder_options.internal_comparator.user_comparator(), table_options_.cuckoo_block_size, table_options_.use_module_hash, - table_options_.identity_as_first_hash, nullptr); + table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, + column_family_id, table_builder_options.column_family_name); } std::string CuckooTableFactory::GetPrintableTableOptions() const { diff --git a/external/rocksdb/table/cuckoo_table_factory.h b/external/rocksdb/table/cuckoo_table_factory.h index 394e834fa3..ff67ae2476 100644 --- a/external/rocksdb/table/cuckoo_table_factory.h +++ b/external/rocksdb/table/cuckoo_table_factory.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -55,14 +55,15 @@ class CuckooTableFactory : public TableFactory { const char* Name() const override { return "CuckooTable"; } - Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader( + const TableReaderOptions& table_reader_options, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table, + bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFileWriter* file) const override; + uint32_t column_family_id, WritableFileWriter* file) const override; // Sanitizes the specified DB Options. Status SanitizeOptions(const DBOptions& db_opts, @@ -72,8 +73,10 @@ class CuckooTableFactory : public TableFactory { std::string GetPrintableTableOptions() const override; + void* GetOptions() override { return &table_options_; } + private: - const CuckooTableOptions table_options_; + CuckooTableOptions table_options_; }; } // namespace rocksdb diff --git a/external/rocksdb/table/cuckoo_table_reader.cc b/external/rocksdb/table/cuckoo_table_reader.cc index 8c0329c66b..f6d69154ef 100644 --- a/external/rocksdb/table/cuckoo_table_reader.cc +++ b/external/rocksdb/table/cuckoo_table_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -17,6 +17,7 @@ #include #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "table/internal_iterator.h" #include "table/meta_blocks.h" #include "table/cuckoo_table_factory.h" #include "table/get_context.h" @@ -44,7 +45,7 @@ CuckooTableReader::CuckooTableReader( } TableProperties* props = nullptr; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - ioptions.env, ioptions.info_log, &props); + ioptions, &props); if (!status_.ok()) { return; } @@ -127,7 +128,7 @@ CuckooTableReader::CuckooTableReader( } Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context) { + GetContext* get_context, bool skip_filters) { assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { @@ -142,11 +143,16 @@ Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, return Status::OK(); } // Here, we compare only the user key part as we support only one entry - // per user key and we don't support sanpshot. + // per user key and we don't support snapshot. if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) { Slice value(bucket + key_length_, value_length_); if (is_last_level_) { - get_context->SaveValue(value); + // Sequence number is not stored at the last level, so we will use + // kMaxSequenceNumber since it is unknown. This could cause some + // transactions to fail to lock a key due to known sequence number. + // However, it is expected for anyone to use a CuckooTable in a + // TransactionDB. 
+ get_context->SaveValue(value, kMaxSequenceNumber); } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; @@ -173,7 +179,7 @@ void CuckooTableReader::Prepare(const Slice& key) { } } -class CuckooTableIterator : public Iterator { +class CuckooTableIterator : public InternalIterator { public: explicit CuckooTableIterator(CuckooTableReader* reader); ~CuckooTableIterator() {} @@ -348,16 +354,17 @@ Slice CuckooTableIterator::value() const { return curr_value_; } -extern Iterator* NewErrorIterator(const Status& status, Arena* arena); +extern InternalIterator* NewErrorInternalIterator(const Status& status, + Arena* arena); -Iterator* CuckooTableReader::NewIterator( - const ReadOptions& read_options, Arena* arena) { +InternalIterator* CuckooTableReader::NewIterator( + const ReadOptions& read_options, Arena* arena, bool skip_filters) { if (!status().ok()) { - return NewErrorIterator( + return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); } if (read_options.total_order_seek) { - return NewErrorIterator( + return NewErrorInternalIterator( Status::InvalidArgument("total_order_seek is not supported."), arena); } CuckooTableIterator* iter; diff --git a/external/rocksdb/table/cuckoo_table_reader.h b/external/rocksdb/table/cuckoo_table_reader.h index 6643be025b..5e3e5528a9 100644 --- a/external/rocksdb/table/cuckoo_table_reader.h +++ b/external/rocksdb/table/cuckoo_table_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -24,6 +24,7 @@ namespace rocksdb { class Arena; class TableReader; +class InternalIterator; class CuckooTableReader: public TableReader { public: @@ -41,9 +42,10 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } Status Get(const ReadOptions& read_options, const Slice& key, - GetContext* get_context) override; + GetContext* get_context, bool skip_filters = false) override; - Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; + InternalIterator* NewIterator(const ReadOptions&, Arena* arena = nullptr, + bool skip_filters = false) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. diff --git a/external/rocksdb/table/cuckoo_table_reader_test.cc b/external/rocksdb/table/cuckoo_table_reader_test.cc index f10fcc5715..e440af90ab 100644 --- a/external/rocksdb/table/cuckoo_table_reader_test.cc +++ b/external/rocksdb/table/cuckoo_table_reader_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -98,8 +98,9 @@ class CuckooReaderTest : public testing::Test { unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), env_options)); - CuckooTableBuilder builder(file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, - 2, false, false, GetSliceHash); + CuckooTableBuilder builder( + file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, + GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -148,7 +149,7 @@ class CuckooReaderTest : public testing::Test { CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); - Iterator* it = reader.NewIterator(ReadOptions(), nullptr); + InternalIterator* it = reader.NewIterator(ReadOptions(), nullptr); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->SeekToFirst(); @@ -196,7 +197,7 @@ class CuckooReaderTest : public testing::Test { ASSERT_TRUE(keys[num_items/2] == it->key()); ASSERT_TRUE(values[num_items/2] == it->value()); ASSERT_OK(it->status()); - it->~Iterator(); + it->~InternalIterator(); } std::vector keys; @@ -405,9 +406,10 @@ void WriteFile(const std::vector& keys, ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), env_options)); - CuckooTableBuilder builder(file_writer.get(), hash_ratio, 64, 1000, - test::Uint64Comparator(), 5, false, - FLAGS_identity_as_first_hash, nullptr); + CuckooTableBuilder builder( + file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, + false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. 
@@ -500,7 +502,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { &get_context); } } - float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; + float time_per_op = (env->NowMicros() - start_time) * 1.0f / num; fprintf(stderr, "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n", time_per_op, 1.0 / time_per_op, batch_size); diff --git a/external/rocksdb/table/filter_block.h b/external/rocksdb/table/filter_block.h index 855a231698..1fe428ec59 100644 --- a/external/rocksdb/table/filter_block.h +++ b/external/rocksdb/table/filter_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -65,7 +65,13 @@ class FilterBlockBuilder { // BlockBased/Full FilterBlock would be called in the same way. 
class FilterBlockReader { public: - explicit FilterBlockReader() {} + explicit FilterBlockReader() + : whole_key_filtering_(true), size_(0), statistics_(nullptr) {} + explicit FilterBlockReader(size_t s, Statistics* stats, + bool _whole_key_filtering) + : whole_key_filtering_(_whole_key_filtering), + size_(s), + statistics_(stats) {} virtual ~FilterBlockReader() {} virtual bool IsBlockBased() = 0; // If is blockbased filter @@ -74,6 +80,10 @@ class FilterBlockReader { virtual bool PrefixMayMatch(const Slice& prefix, uint64_t block_offset = kNotValid) = 0; virtual size_t ApproximateMemoryUsage() const = 0; + virtual size_t size() const { return size_; } + virtual Statistics* statistics() const { return statistics_; } + + bool whole_key_filtering() const { return whole_key_filtering_; } // convert this object to a human readable form virtual std::string ToString() const { @@ -81,10 +91,15 @@ class FilterBlockReader { return error_msg; } + protected: + bool whole_key_filtering_; + private: // No copying allowed FilterBlockReader(const FilterBlockReader&); void operator=(const FilterBlockReader&); + size_t size_; + Statistics* statistics_; }; } // namespace rocksdb diff --git a/external/rocksdb/table/flush_block_policy.cc b/external/rocksdb/table/flush_block_policy.cc index 4c12b30bb2..8fef4d9146 100644 --- a/external/rocksdb/table/flush_block_policy.cc +++ b/external/rocksdb/table/flush_block_policy.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -21,11 +21,11 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { // reaches the configured FlushBlockBySizePolicy(const uint64_t block_size, const uint64_t block_size_deviation, - const BlockBuilder& data_block_builder) : - block_size_(block_size), - block_size_deviation_(block_size_deviation), - data_block_builder_(data_block_builder) { - } + const BlockBuilder& data_block_builder) + : block_size_(block_size), + block_size_deviation_limit_( + ((block_size * (100 - block_size_deviation)) + 99) / 100), + data_block_builder_(data_block_builder) {} virtual bool Update(const Slice& key, const Slice& value) override { @@ -46,18 +46,20 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { private: bool BlockAlmostFull(const Slice& key, const Slice& value) const { + if (block_size_deviation_limit_ == 0) { + return false; + } + const auto curr_size = data_block_builder_.CurrentSizeEstimate(); const auto estimated_size_after = data_block_builder_.EstimateSizeAfterKV(key, value); - return - estimated_size_after > block_size_ && - block_size_deviation_ > 0 && - curr_size * 100 > block_size_ * (100 - block_size_deviation_); + return estimated_size_after > block_size_ && + curr_size > block_size_deviation_limit_; } const uint64_t block_size_; - const uint64_t block_size_deviation_; + const uint64_t block_size_deviation_limit_; const BlockBuilder& data_block_builder_; }; diff --git a/external/rocksdb/table/format.cc b/external/rocksdb/table/format.cc index a58bbee24c..3675bbadec 100644 --- a/external/rocksdb/table/format.cc +++ b/external/rocksdb/table/format.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -14,6 +14,8 @@ #include "rocksdb/env.h" #include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" @@ -21,6 +23,9 @@ #include "util/perf_context_imp.h" #include "util/string_util.h" #include "util/xxhash.h" +#include "util/statistics.h" +#include "util/stop_watch.h" + namespace rocksdb { @@ -37,12 +42,16 @@ const uint64_t kPlainTableMagicNumber = 0; #endif const uint32_t DefaultStackBufferSize = 5000; +bool ShouldReportDetailedTime(Env* env, Statistics* stats) { + return env != nullptr && stats != nullptr && + stats->stats_level_ > kExceptDetailedTimers; +} + void BlockHandle::EncodeTo(std::string* dst) const { // Sanity check that all fields have been set assert(offset_ != ~static_cast(0)); assert(size_ != ~static_cast(0)); - PutVarint64(dst, offset_); - PutVarint64(dst, size_); + PutVarint64Varint64(dst, offset_, size_); } Status BlockHandle::DecodeFrom(Slice* input) { @@ -59,14 +68,7 @@ std::string BlockHandle::ToString(bool hex) const { std::string handle_str; EncodeTo(&handle_str); if (hex) { - std::string result; - char buf[10]; - for (size_t i = 0; i < handle_str.size(); i++) { - snprintf(buf, sizeof(buf), "%02X", - static_cast(handle_str[i])); - result += buf; - } - return result; + return Slice(handle_str).ToString(true); } else { return handle_str; } @@ -301,9 +303,12 @@ Status ReadBlock(RandomAccessFileReader* file, const Footer& footer, } // namespace Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, Env* env, - bool decompression_requested) { + const ReadOptions& read_options, + const BlockHandle& handle, BlockContents* contents, + const ImmutableCFOptions &ioptions, + bool decompression_requested, + const Slice& compression_dict, + const PersistentCacheOptions& cache_options) { Status status; Slice slice; 
size_t n = static_cast(handle.size()); @@ -312,17 +317,63 @@ Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, char* used_buf = nullptr; rocksdb::CompressionType compression_type; - if (decompression_requested && - n + kBlockTrailerSize < DefaultStackBufferSize) { - // If we've got a small enough hunk of data, read it in to the - // trivially allocated stack buffer instead of needing a full malloc() - used_buf = &stack_buf[0]; + if (cache_options.persistent_cache && + !cache_options.persistent_cache->IsCompressed()) { + status = PersistentCacheHelper::LookupUncompressedPage(cache_options, + handle, contents); + if (status.ok()) { + // uncompressed page is found for the block handle + return status; + } else { + // uncompressed page is not found + if (ioptions.info_log && !status.IsNotFound()) { + assert(!status.ok()); + Log(InfoLogLevel::INFO_LEVEL, ioptions.info_log, + "Error reading from persistent cache. %s", + status.ToString().c_str()); + } + } + } + + if (cache_options.persistent_cache && + cache_options.persistent_cache->IsCompressed()) { + // lookup uncompressed cache mode p-cache + status = PersistentCacheHelper::LookupRawPage( + cache_options, handle, &heap_buf, n + kBlockTrailerSize); } else { - heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); - used_buf = heap_buf.get(); + status = Status::NotFound(); } - status = ReadBlock(file, footer, options, handle, &slice, used_buf); + if (status.ok()) { + // cache hit + used_buf = heap_buf.get(); + slice = Slice(heap_buf.get(), n); + } else { + if (ioptions.info_log && !status.IsNotFound()) { + assert(!status.ok()); + Log(InfoLogLevel::INFO_LEVEL, ioptions.info_log, + "Error reading from persistent cache. 
%s", status.ToString().c_str()); + } + // cache miss read from device + if (decompression_requested && + n + kBlockTrailerSize < DefaultStackBufferSize) { + // If we've got a small enough hunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + used_buf = &stack_buf[0]; + } else { + heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); + used_buf = heap_buf.get(); + } + + status = ReadBlock(file, footer, read_options, handle, &slice, used_buf); + if (status.ok() && read_options.fill_cache && + cache_options.persistent_cache && + cache_options.persistent_cache->IsCompressed()) { + // insert to raw cache + PersistentCacheHelper::InsertRawPage(cache_options, handle, used_buf, + n + kBlockTrailerSize); + } + } if (!status.ok()) { return status; @@ -333,37 +384,45 @@ Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, compression_type = static_cast(slice.data()[n]); if (decompression_requested && compression_type != kNoCompression) { - return UncompressBlockContents(slice.data(), n, contents, footer.version()); - } - - if (slice.data() != used_buf) { + // compressed page, uncompress, update cache + status = UncompressBlockContents(slice.data(), n, contents, + footer.version(), compression_dict, + ioptions); + } else if (slice.data() != used_buf) { + // the slice content is not the buffer provided *contents = BlockContents(Slice(slice.data(), n), false, compression_type); - return status; + } else { + // page is uncompressed, the buffer either stack or heap provided + if (used_buf == &stack_buf[0]) { + heap_buf = std::unique_ptr(new char[n]); + memcpy(heap_buf.get(), stack_buf, n); + } + *contents = BlockContents(std::move(heap_buf), n, true, compression_type); } - if (used_buf == &stack_buf[0]) { - heap_buf = std::unique_ptr(new char[n]); - memcpy(heap_buf.get(), stack_buf, n); + if (status.ok() && read_options.fill_cache && + cache_options.persistent_cache && + 
!cache_options.persistent_cache->IsCompressed()) { + // insert to uncompressed cache + PersistentCacheHelper::InsertUncompressedPage(cache_options, handle, + *contents); } - *contents = BlockContents(std::move(heap_buf), n, true, compression_type); return status; } -// -// The 'data' points to the raw block contents that was read in from file. -// This method allocates a new heap buffer and the raw block -// contents are uncompresed into this buffer. This -// buffer is returned via 'result' and it is upto the caller to -// free this buffer. -// format_version is the block format as defined in include/rocksdb/table.h -Status UncompressBlockContents(const char* data, size_t n, - BlockContents* contents, - uint32_t format_version) { +Status UncompressBlockContentsForCompressionType( + const char* data, size_t n, BlockContents* contents, + uint32_t format_version, const Slice& compression_dict, + CompressionType compression_type, const ImmutableCFOptions &ioptions) { std::unique_ptr ubuf; + + assert(compression_type != kNoCompression && "Invalid compression type"); + + StopWatchNano timer(ioptions.env, + ShouldReportDetailedTime(ioptions.env, ioptions.statistics)); int decompress_size = 0; - assert(data[n] != kNoCompression); - switch (data[n]) { + switch (compression_type) { case kSnappyCompression: { size_t ulength = 0; static char snappy_corrupt_msg[] = @@ -371,7 +430,7 @@ Status UncompressBlockContents(const char* data, size_t n, if (!Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } - ubuf = std::unique_ptr(new char[ulength]); + ubuf.reset(new char[ulength]); if (!Snappy_Uncompress(data, n, ubuf.get())) { return Status::Corruption(snappy_corrupt_msg); } @@ -379,9 +438,10 @@ Status UncompressBlockContents(const char* data, size_t n, break; } case kZlibCompression: - ubuf = std::unique_ptr(Zlib_Uncompress( + ubuf.reset(Zlib_Uncompress( data, n, &decompress_size, - GetCompressFormatForVersion(kZlibCompression, 
format_version))); + GetCompressFormatForVersion(kZlibCompression, format_version), + compression_dict)); if (!ubuf) { static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; @@ -391,7 +451,7 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kBZip2Compression: - ubuf = std::unique_ptr(BZip2_Uncompress( + ubuf.reset(BZip2_Uncompress( data, n, &decompress_size, GetCompressFormatForVersion(kBZip2Compression, format_version))); if (!ubuf) { @@ -403,9 +463,10 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4Compression: - ubuf = std::unique_ptr(LZ4_Uncompress( + ubuf.reset(LZ4_Uncompress( data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4Compression, format_version))); + GetCompressFormatForVersion(kLZ4Compression, format_version), + compression_dict)); if (!ubuf) { static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; @@ -415,9 +476,10 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4HCCompression: - ubuf = std::unique_ptr(LZ4_Uncompress( + ubuf.reset(LZ4_Uncompress( data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4HCCompression, format_version))); + GetCompressFormatForVersion(kLZ4HCCompression, format_version), + compression_dict)); if (!ubuf) { static char lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; @@ -426,9 +488,18 @@ Status UncompressBlockContents(const char* data, size_t n, *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; + case kXpressCompression: + ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); + if (!ubuf) { + static char xpress_corrupt_msg[] = + "XPRESS not supported 
or corrupted XPRESS compressed block contents"; + return Status::Corruption(xpress_corrupt_msg); + } + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + break; case kZSTDNotFinalCompression: - ubuf = - std::unique_ptr(ZSTD_Uncompress(data, n, &decompress_size)); + ubuf.reset(ZSTD_Uncompress(data, n, &decompress_size, compression_dict)); if (!ubuf) { static char zstd_corrupt_msg[] = "ZSTD not supported or corrupted ZSTD compressed block contents"; @@ -440,7 +511,32 @@ Status UncompressBlockContents(const char* data, size_t n, default: return Status::Corruption("bad block type"); } + + if(ShouldReportDetailedTime(ioptions.env, ioptions.statistics)){ + MeasureTime(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + MeasureTime(ioptions.statistics, BYTES_DECOMPRESSED, contents->data.size()); + RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); + } + return Status::OK(); } +// +// The 'data' points to the raw block contents that was read in from file. +// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This +// buffer is returned via 'result' and it is upto the caller to +// free this buffer. +// format_version is the block format as defined in include/rocksdb/table.h +Status UncompressBlockContents(const char* data, size_t n, + BlockContents* contents, uint32_t format_version, + const Slice& compression_dict, + const ImmutableCFOptions &ioptions) { + assert(data[n] != kNoCompression); + return UncompressBlockContentsForCompressionType( + data, n, contents, format_version, compression_dict, + (CompressionType)data[n], ioptions); +} + } // namespace rocksdb diff --git a/external/rocksdb/table/format.h b/external/rocksdb/table/format.h index 74ec808c6f..571659d596 100644 --- a/external/rocksdb/table/format.h +++ b/external/rocksdb/table/format.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -15,12 +15,17 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" +#include "port/port.h" // noexcept +#include "table/persistent_cache_helper.h" + namespace rocksdb { class Block; class RandomAccessFile; struct ReadOptions; +extern bool ShouldReportDetailedTime(Env* env, Statistics* stats); + // the length of the magic number in bytes. const int kMagicNumberLengthByte = 8; @@ -69,6 +74,7 @@ inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, uint32_t version) { // snappy is not versioned assert(compression_type != kSnappyCompression && + compression_type != kXpressCompression && compression_type != kNoCompression); // As of version 2, we encode compressed block with // compress_format_version == 2. Before that, the version is 1. @@ -192,7 +198,7 @@ struct BlockContents { compression_type(_compression_type), allocation(std::move(_data)) {} - BlockContents(BlockContents&& other) { *this = std::move(other); } + BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT { *this = std::move(other); } BlockContents& operator=(BlockContents&& other) { data = std::move(other.data); @@ -205,12 +211,12 @@ struct BlockContents { // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. 
-extern Status ReadBlockContents(RandomAccessFileReader* file, - const Footer& footer, - const ReadOptions& options, - const BlockHandle& handle, - BlockContents* contents, Env* env, - bool do_uncompress); +extern Status ReadBlockContents( + RandomAccessFileReader* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + BlockContents* contents, const ImmutableCFOptions &ioptions, + bool do_uncompress = true, const Slice& compression_dict = Slice(), + const PersistentCacheOptions& cache_options = PersistentCacheOptions()); // The 'data' points to the raw block contents read in from file. // This method allocates a new heap buffer and the raw block @@ -221,7 +227,17 @@ extern Status ReadBlockContents(RandomAccessFileReader* file, // util/compression.h extern Status UncompressBlockContents(const char* data, size_t n, BlockContents* contents, - uint32_t compress_format_version); + uint32_t compress_format_version, + const Slice& compression_dict, + const ImmutableCFOptions &ioptions); + +// This is an extension to UncompressBlockContents that accepts +// a specific compression type. This is used by un-wrapped blocks +// with no compression header. +extern Status UncompressBlockContentsForCompressionType( + const char* data, size_t n, BlockContents* contents, + uint32_t compress_format_version, const Slice& compression_dict, + CompressionType compression_type, const ImmutableCFOptions &ioptions); // Implementation details follow. Clients should ignore, diff --git a/external/rocksdb/table/full_filter_block.cc b/external/rocksdb/table/full_filter_block.cc index 3744d417f9..1c89cc1c4f 100644 --- a/external/rocksdb/table/full_filter_block.cc +++ b/external/rocksdb/table/full_filter_block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -53,20 +53,22 @@ Slice FullFilterBlockBuilder::Finish() { } FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - const Slice& contents, FilterBitsReader* filter_bits_reader) - : prefix_extractor_(prefix_extractor), - whole_key_filtering_(whole_key_filtering), + const SliceTransform* prefix_extractor, bool _whole_key_filtering, + const Slice& contents, FilterBitsReader* filter_bits_reader, + Statistics* stats) + : FilterBlockReader(contents.size(), stats, _whole_key_filtering), + prefix_extractor_(prefix_extractor), contents_(contents) { assert(filter_bits_reader != nullptr); filter_bits_reader_.reset(filter_bits_reader); } FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader) - : FullFilterBlockReader(prefix_extractor, whole_key_filtering, - contents.data, filter_bits_reader) { + const SliceTransform* prefix_extractor, bool _whole_key_filtering, + BlockContents&& contents, FilterBitsReader* filter_bits_reader, + Statistics* stats) + : FullFilterBlockReader(prefix_extractor, _whole_key_filtering, + contents.data, filter_bits_reader, stats) { block_contents_ = std::move(contents); } diff --git a/external/rocksdb/table/full_filter_block.h b/external/rocksdb/table/full_filter_block.h index 1ecc07a01f..4aa357f8a8 100644 --- a/external/rocksdb/table/full_filter_block.h +++ b/external/rocksdb/table/full_filter_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -75,11 +75,13 @@ class FullFilterBlockReader : public FilterBlockReader { explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, bool whole_key_filtering, const Slice& contents, - FilterBitsReader* filter_bits_reader); + FilterBitsReader* filter_bits_reader, + Statistics* statistics); explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, bool whole_key_filtering, BlockContents&& contents, - FilterBitsReader* filter_bits_reader); + FilterBitsReader* filter_bits_reader, + Statistics* statistics); // bits_reader is created in filter_policy, it should be passed in here // directly. and be deleted here @@ -94,7 +96,6 @@ class FullFilterBlockReader : public FilterBlockReader { private: const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; std::unique_ptr filter_bits_reader_; Slice contents_; diff --git a/external/rocksdb/table/full_filter_block_test.cc b/external/rocksdb/table/full_filter_block_test.cc index 0275a6ca69..51ce1aaa99 100644 --- a/external/rocksdb/table/full_filter_block_test.cc +++ b/external/rocksdb/table/full_filter_block_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -110,7 +110,7 @@ TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { FullFilterBlockReader reader( nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block)); + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Remain same symantic with blockbased filter ASSERT_TRUE(reader.KeyMayMatch("foo")); } @@ -126,7 +126,7 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { Slice block = builder.Finish(); FullFilterBlockReader reader( nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block)); + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); ASSERT_TRUE(reader.KeyMayMatch("foo")); ASSERT_TRUE(reader.KeyMayMatch("bar")); ASSERT_TRUE(reader.KeyMayMatch("box")); @@ -155,7 +155,7 @@ TEST_F(FullFilterBlockTest, EmptyBuilder) { FullFilterBlockReader reader( nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block)); + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Remain same symantic with blockbased filter ASSERT_TRUE(reader.KeyMayMatch("foo")); } @@ -171,7 +171,7 @@ TEST_F(FullFilterBlockTest, SingleChunk) { Slice block = builder.Finish(); FullFilterBlockReader reader( nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block)); + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); ASSERT_TRUE(reader.KeyMayMatch("foo")); ASSERT_TRUE(reader.KeyMayMatch("bar")); ASSERT_TRUE(reader.KeyMayMatch("box")); diff --git a/external/rocksdb/table/get_context.cc b/external/rocksdb/table/get_context.cc index 609ca30831..4a7a9693b7 100644 --- a/external/rocksdb/table/get_context.cc +++ b/external/rocksdb/table/get_context.cc @@ -1,9 +1,11 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. #include "table/get_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" @@ -34,7 +36,9 @@ GetContext::GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, std::string* ret_value, - bool* value_found, MergeContext* merge_context, Env* env) + bool* value_found, MergeContext* merge_context, Env* env, + SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -45,7 +49,13 @@ GetContext::GetContext(const Comparator* ucmp, value_found_(value_found), merge_context_(merge_context), env_(env), - replay_log_(nullptr) {} + seq_(seq), + replay_log_(nullptr), + pinned_iters_mgr_(_pinned_iters_mgr) { + if (seq_) { + *seq_ = kMaxSequenceNumber; + } +} // Called from TableCache::Get and Table::Get when file/block in which // key may exist are not there in TableCache/BlockCache respectively. 
In this @@ -59,44 +69,50 @@ void GetContext::MarkKeyMayExist() { } } -void GetContext::SaveValue(const Slice& value) { +void GetContext::SaveValue(const Slice& value, SequenceNumber seq) { assert(state_ == kNotFound); appendToReplayLog(replay_log_, kTypeValue, value); state_ = kFound; - value_->assign(value.data(), value.size()); + if (value_ != nullptr) { + value_->assign(value.data(), value.size()); + } } bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, - const Slice& value) { + const Slice& value, bool value_pinned) { assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); if (ucmp_->Equal(parsed_key.user_key, user_key_)) { appendToReplayLog(replay_log_, parsed_key.type, value); + if (seq_ != nullptr) { + // Set the sequence number if it is uninitialized + if (*seq_ == kMaxSequenceNumber) { + *seq_ = parsed_key.sequence; + } + } + // Key matches. Process it switch (parsed_key.type) { case kTypeValue: assert(state_ == kNotFound || state_ == kMerge); if (kNotFound == state_) { state_ = kFound; - value_->assign(value.data(), value.size()); + if (value_ != nullptr) { + value_->assign(value.data(), value.size()); + } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); state_ = kFound; - bool merge_success = false; - { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - merge_success = merge_operator_->FullMerge( - user_key_, &value, merge_context_->GetOperands(), value_, - logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanosSafe()); - } - if (!merge_success) { - RecordTick(statistics_, NUMBER_MERGE_FAILURES); - state_ = kCorrupt; + if (value_ != nullptr) { + Status merge_status = + MergeHelper::TimedFullMerge(merge_operator_, user_key_, &value, + merge_context_->GetOperands(), + value_, logger_, statistics_, env_); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } } return false; @@ -110,19 +126,15 @@ bool 
GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kDeleted; } else if (kMerge == state_) { state_ = kFound; - bool merge_success = false; - { - StopWatchNano timer(env_, statistics_ != nullptr); - PERF_TIMER_GUARD(merge_operator_time_nanos); - merge_success = merge_operator_->FullMerge( - user_key_, nullptr, merge_context_->GetOperands(), value_, - logger_); - RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME, - timer.ElapsedNanosSafe()); - } - if (!merge_success) { - RecordTick(statistics_, NUMBER_MERGE_FAILURES); - state_ = kCorrupt; + if (value_ != nullptr) { + Status merge_status = + MergeHelper::TimedFullMerge(merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), + value_, logger_, statistics_, env_); + + if (!merge_status.ok()) { + state_ = kCorrupt; + } } } return false; @@ -130,7 +142,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, case kTypeMerge: assert(state_ == kNotFound || state_ == kMerge); state_ = kMerge; - merge_context_->PushOperand(value); + merge_context_->PushOperand(value, value_pinned); return true; default: @@ -154,8 +166,11 @@ void replayGetContextLog(const Slice& replay_log, const Slice& user_key, bool ret = GetLengthPrefixedSlice(&s, &value); assert(ret); (void)ret; - // Sequence number is ignored in SaveValue, so we just pass 0. - get_context->SaveValue(ParsedInternalKey(user_key, 0, type), value); + + // Since SequenceNumber is not stored and unknown, we will use + // kMaxSequenceNumber. + get_context->SaveValue( + ParsedInternalKey(user_key, kMaxSequenceNumber, type), value, true); } #else // ROCKSDB_LITE assert(false); diff --git a/external/rocksdb/table/get_context.h b/external/rocksdb/table/get_context.h index 2c2dd8e1d8..4cee09a8d5 100644 --- a/external/rocksdb/table/get_context.h +++ b/external/rocksdb/table/get_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,9 +7,11 @@ #include #include "db/merge_context.h" #include "rocksdb/env.h" +#include "rocksdb/types.h" namespace rocksdb { class MergeContext; +class PinnedIteratorsManager; class GetContext { public: @@ -24,18 +26,36 @@ class GetContext { GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, std::string* ret_value, bool* value_found, - MergeContext* merge_context, Env* env_); + MergeContext* merge_context, Env* env, + SequenceNumber* seq = nullptr, + PinnedIteratorsManager* _pinned_iters_mgr = nullptr); void MarkKeyMayExist(); - void SaveValue(const Slice& value); - bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value); + + // Records this key, value, and any meta-data (such as sequence number and + // state) into this GetContext. + // + // Returns True if more keys need to be read (due to merges) or + // False if the complete value has been found. + bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value, + bool value_pinned = false); + + // Simplified version of the previous function. Should only be used when we + // know that the operation is a Put. + void SaveValue(const Slice& value, SequenceNumber seq); + GetState State() const { return state_; } + PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; } + // If a non-null string is passed, all the SaveValue calls will be // logged into the string. The operations can then be replayed on // another GetContext with replayGetContextLog. void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; } + // Do we need to fetch the SequenceNumber for this key? 
+ bool NeedToReadSequence() const { return (seq_ != nullptr); } + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -49,7 +69,12 @@ class GetContext { bool* value_found_; // Is value set correctly? Used by KeyMayExist MergeContext* merge_context_; Env* env_; + // If a key is found, seq_ will be set to the SequenceNumber of most recent + // write to the key or kMaxSequenceNumber if unknown + SequenceNumber* seq_; std::string* replay_log_; + // Used to temporarily pin blocks when state_ == GetContext::kMerge + PinnedIteratorsManager* pinned_iters_mgr_; }; void replayGetContextLog(const Slice& replay_log, const Slice& user_key, diff --git a/external/rocksdb/table/internal_iterator.h b/external/rocksdb/table/internal_iterator.h new file mode 100644 index 0000000000..f1f1e0bffc --- /dev/null +++ b/external/rocksdb/table/internal_iterator.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once + +#include +#include "rocksdb/iterator.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class PinnedIteratorsManager; + +class InternalIterator : public Cleanable { + public: + InternalIterator() {} + virtual ~InternalIterator() {} + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. 
+ virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void Seek(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). + virtual Status status() const = 0; + + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont + // communicate with PinnedIteratorsManager so default implementation is no-op + // but for Iterators that need to communicate with PinnedIteratorsManager + // they will implement this function and use the passed pointer to communicate + // with PinnedIteratorsManager. + virtual void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {} + + // If true, this means that the Slice returned by key() is valid as long as + // PinnedIteratorsManager::ReleasePinnedIterators is not called and the + // Iterator is not deleted. 
+ // + // IsKeyPinned() is guaranteed to always return true if + // - Iterator is created with ReadOptions::pin_data = true + // - DB tables were created with BlockBasedTableOptions::use_delta_encoding + // set to false. + virtual bool IsKeyPinned() const { return false; } + + // If true, this means that the Slice returned by value() is valid as long as + // PinnedIteratorsManager::ReleasePinnedIterators is not called and the + // Iterator is not deleted. + virtual bool IsValuePinned() const { return false; } + + virtual Status GetProperty(std::string prop_name, std::string* prop) { + return Status::NotSupported(""); + } + + private: + // No copying allowed + InternalIterator(const InternalIterator&) = delete; + InternalIterator& operator=(const InternalIterator&) = delete; +}; + +// Return an empty iterator (yields nothing). +extern InternalIterator* NewEmptyInternalIterator(); + +// Return an empty iterator with the specified status. +extern InternalIterator* NewErrorInternalIterator(const Status& status); + +} // namespace rocksdb diff --git a/external/rocksdb/table/iter_heap.h b/external/rocksdb/table/iter_heap.h index 5343175c3b..642383345c 100644 --- a/external/rocksdb/table/iter_heap.h +++ b/external/rocksdb/table/iter_heap.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/table/iterator.cc b/external/rocksdb/table/iterator.cc index f97879aea0..09f7f8e687 100644 --- a/external/rocksdb/table/iterator.cc +++ b/external/rocksdb/table/iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,17 +8,18 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/iterator.h" +#include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "util/arena.h" namespace rocksdb { -Iterator::Iterator() { +Cleanable::Cleanable() { cleanup_.function = nullptr; cleanup_.next = nullptr; } -Iterator::~Iterator() { +Cleanable::~Cleanable() { if (cleanup_.function != nullptr) { (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); for (Cleanup* c = cleanup_.next; c != nullptr; ) { @@ -30,7 +31,7 @@ Iterator::~Iterator() { } } -void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { +void Cleanable::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { assert(func != nullptr); Cleanup* c; if (cleanup_.function == nullptr) { @@ -45,6 +46,17 @@ void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { c->arg2 = arg2; } +Status Iterator::GetProperty(std::string prop_name, std::string* prop) { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); + } + if (prop_name == "rocksdb.iterator.is-key-pinned") { + *prop = "0"; + return Status::OK(); + } + return Status::InvalidArgument("Undentified property."); +} + namespace { class EmptyIterator : public Iterator { public: @@ -68,31 +80,62 @@ class EmptyIterator : public Iterator { private: Status status_; }; + +class EmptyInternalIterator : public InternalIterator { + public: + explicit EmptyInternalIterator(const Status& s) : status_(s) {} + virtual bool Valid() const override { return false; } + virtual void Seek(const Slice& target) override {} + virtual void SeekToFirst() override {} + virtual void SeekToLast() override {} + virtual void Next() override { 
assert(false); } + virtual void Prev() override { assert(false); } + Slice key() const override { + assert(false); + return Slice(); + } + Slice value() const override { + assert(false); + return Slice(); + } + virtual Status status() const override { return status_; } + + private: + Status status_; +}; } // namespace Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); } -Iterator* NewEmptyIterator(Arena* arena) { +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +InternalIterator* NewEmptyInternalIterator() { + return new EmptyInternalIterator(Status::OK()); +} + +InternalIterator* NewEmptyInternalIterator(Arena* arena) { if (arena == nullptr) { - return NewEmptyIterator(); + return NewEmptyInternalIterator(); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyIterator(Status::OK()); + return new (mem) EmptyInternalIterator(Status::OK()); } } -Iterator* NewErrorIterator(const Status& status) { - return new EmptyIterator(status); +InternalIterator* NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator(status); } -Iterator* NewErrorIterator(const Status& status, Arena* arena) { +InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena) { if (arena == nullptr) { - return NewErrorIterator(status); + return NewErrorInternalIterator(status); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyIterator(status); + return new (mem) EmptyInternalIterator(status); } } diff --git a/external/rocksdb/table/iterator_wrapper.h b/external/rocksdb/table/iterator_wrapper.h index d64047bea9..e68bbf3f05 100644 --- a/external/rocksdb/table/iterator_wrapper.h +++ b/external/rocksdb/table/iterator_wrapper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -9,38 +9,46 @@ #pragma once -#include "rocksdb/iterator.h" +#include + +#include "table/internal_iterator.h" namespace rocksdb { -// A internal wrapper class with an interface similar to Iterator that -// caches the valid() and key() results for an underlying iterator. +// A internal wrapper class with an interface similar to Iterator that caches +// the valid() and key() results for an underlying iterator. // This can help avoid virtual function calls and also gives better // cache locality. class IteratorWrapper { public: - IteratorWrapper(): iter_(nullptr), valid_(false) { } - explicit IteratorWrapper(Iterator* _iter) : iter_(nullptr) { Set(_iter); } + IteratorWrapper() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapper(InternalIterator* _iter) : iter_(nullptr) { + Set(_iter); + } ~IteratorWrapper() {} - Iterator* iter() const { return iter_; } + InternalIterator* iter() const { return iter_; } + + // Set the underlying Iterator to _iter and return + // previous underlying Iterator. + InternalIterator* Set(InternalIterator* _iter) { + InternalIterator* old_iter = iter_; - // Takes ownership of "iter" and will delete it when destroyed, or - // when Set() is invoked again. 
- void Set(Iterator* _iter) { - delete iter_; iter_ = _iter; if (iter_ == nullptr) { valid_ = false; } else { Update(); } + return old_iter; } void DeleteIter(bool is_arena_mode) { - if (!is_arena_mode) { - delete iter_; - } else { - iter_->~Iterator(); + if (iter_) { + if (!is_arena_mode) { + delete iter_; + } else { + iter_->~InternalIterator(); + } } } @@ -56,6 +64,19 @@ class IteratorWrapper { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { + assert(iter_); + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + bool IsKeyPinned() const { + assert(Valid()); + return iter_->IsKeyPinned(); + } + bool IsValuePinned() const { + assert(Valid()); + return iter_->IsValuePinned(); + } + private: void Update() { valid_ = iter_->Valid(); @@ -64,16 +85,17 @@ class IteratorWrapper { } } - Iterator* iter_; + InternalIterator* iter_; bool valid_; Slice key_; }; class Arena; // Return an empty iterator (yields nothing) allocated from arena. -extern Iterator* NewEmptyIterator(Arena* arena); +extern InternalIterator* NewEmptyInternalIterator(Arena* arena); // Return an empty iterator with the specified status, allocated arena. -extern Iterator* NewErrorIterator(const Status& status, Arena* arena); +extern InternalIterator* NewErrorInternalIterator(const Status& status, + Arena* arena); } // namespace rocksdb diff --git a/external/rocksdb/table/merger.cc b/external/rocksdb/table/merger.cc index 242587ea86..637959d9ac 100644 --- a/external/rocksdb/table/merger.cc +++ b/external/rocksdb/table/merger.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,17 +11,19 @@ #include +#include "db/pinned_iterators_manager.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "table/internal_iterator.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" #include "util/arena.h" +#include "util/autovector.h" #include "util/heap.h" +#include "util/perf_context_imp.h" #include "util/stop_watch.h" #include "util/sync_point.h" -#include "util/perf_context_imp.h" -#include "util/autovector.h" namespace rocksdb { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes @@ -32,15 +34,16 @@ typedef BinaryHeap MergerMinIterHeap; const size_t kNumIterReserve = 4; -class MergingIterator : public Iterator { +class MergingIterator : public InternalIterator { public: - MergingIterator(const Comparator* comparator, Iterator** children, int n, - bool is_arena_mode) + MergingIterator(const Comparator* comparator, InternalIterator** children, + int n, bool is_arena_mode) : is_arena_mode_(is_arena_mode), comparator_(comparator), current_(nullptr), direction_(kForward), - minHeap_(comparator_) { + minHeap_(comparator_), + pinned_iters_mgr_(nullptr) { children_.resize(n); for (int i = 0; i < n; i++) { children_[i].Set(children[i]); @@ -53,9 +56,12 @@ class MergingIterator : public Iterator { current_ = CurrentForward(); } - virtual void AddIterator(Iterator* iter) { + virtual void AddIterator(InternalIterator* iter) { assert(direction_ == kForward); children_.emplace_back(iter); + if (pinned_iters_mgr_) { + iter->SetPinnedItersMgr(pinned_iters_mgr_); + } auto new_wrapper = children_.back(); if (new_wrapper.Valid()) { minHeap_.push(&new_wrapper); @@ -237,6 +243,26 @@ class MergingIterator : public Iterator { return s; } + virtual void SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + for (auto& 
child : children_) { + child.SetPinnedItersMgr(pinned_iters_mgr); + } + } + + virtual bool IsKeyPinned() const override { + assert(Valid()); + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsKeyPinned(); + } + + virtual bool IsValuePinned() const override { + assert(Valid()); + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsValuePinned(); + } + private: // Clears heaps for both directions, used when changing direction or seeking void ClearHeaps(); @@ -262,6 +288,7 @@ class MergingIterator : public Iterator { // Max heap is used for reverse iteration, which is way less common than // forward. Lazily initialize it to save memory. std::unique_ptr maxHeap_; + PinnedIteratorsManager* pinned_iters_mgr_; IteratorWrapper* CurrentForward() const { assert(direction_ == kForward); @@ -288,11 +315,12 @@ void MergingIterator::InitMaxHeap() { } } -Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n, - Arena* arena) { +InternalIterator* NewMergingIterator(const Comparator* cmp, + InternalIterator** list, int n, + Arena* arena) { assert(n >= 0); if (n == 0) { - return NewEmptyIterator(arena); + return NewEmptyInternalIterator(arena); } else if (n == 1) { return list[0]; } else { @@ -313,7 +341,7 @@ MergeIteratorBuilder::MergeIteratorBuilder(const Comparator* comparator, merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true); } -void MergeIteratorBuilder::AddIterator(Iterator* iter) { +void MergeIteratorBuilder::AddIterator(InternalIterator* iter) { if (!use_merging_iter && first_iter != nullptr) { merge_iter->AddIterator(first_iter); use_merging_iter = true; @@ -325,7 +353,7 @@ void MergeIteratorBuilder::AddIterator(Iterator* iter) { } } -Iterator* MergeIteratorBuilder::Finish() { +InternalIterator* MergeIteratorBuilder::Finish() { if (!use_merging_iter) { return first_iter; } else { diff --git a/external/rocksdb/table/merger.h b/external/rocksdb/table/merger.h index 
7dcf2afe78..7291a03782 100644 --- a/external/rocksdb/table/merger.h +++ b/external/rocksdb/table/merger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,7 +14,7 @@ namespace rocksdb { class Comparator; -class Iterator; +class InternalIterator; class Env; class Arena; @@ -26,9 +26,9 @@ class Arena; // key is present in K child iterators, it will be yielded K times. // // REQUIRES: n >= 0 -extern Iterator* NewMergingIterator(const Comparator* comparator, - Iterator** children, int n, - Arena* arena = nullptr); +extern InternalIterator* NewMergingIterator(const Comparator* comparator, + InternalIterator** children, int n, + Arena* arena = nullptr); class MergingIterator; @@ -41,18 +41,18 @@ class MergeIteratorBuilder { ~MergeIteratorBuilder() {} // Add iter to the merging iterator. - void AddIterator(Iterator* iter); + void AddIterator(InternalIterator* iter); // Get arena used to build the merging iterator. It is called one a child // iterator needs to be allocated. Arena* GetArena() { return arena; } // Return the result merging iterator. - Iterator* Finish(); + InternalIterator* Finish(); private: MergingIterator* merge_iter; - Iterator* first_iter; + InternalIterator* first_iter; bool use_merging_iter; Arena* arena; }; diff --git a/external/rocksdb/table/merger_test.cc b/external/rocksdb/table/merger_test.cc index 562c0ae85c..97979af7ce 100644 --- a/external/rocksdb/table/merger_test.cc +++ b/external/rocksdb/table/merger_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -88,7 +88,7 @@ class MergerTest : public testing::Test { void Generate(size_t num_iterators, size_t strings_per_iterator, int letters_per_string) { - std::vector small_iterators; + std::vector small_iterators; for (size_t i = 0; i < num_iterators; ++i) { auto strings = GenerateStrings(strings_per_iterator, letters_per_string); small_iterators.push_back(new test::VectorIterator(strings)); @@ -102,8 +102,8 @@ class MergerTest : public testing::Test { } Random rnd_; - std::unique_ptr merging_iterator_; - std::unique_ptr single_iterator_; + std::unique_ptr merging_iterator_; + std::unique_ptr single_iterator_; std::vector all_keys_; }; diff --git a/external/rocksdb/table/meta_blocks.cc b/external/rocksdb/table/meta_blocks.cc index 7bcdf7576e..e98a638e04 100644 --- a/external/rocksdb/table/meta_blocks.cc +++ b/external/rocksdb/table/meta_blocks.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -12,6 +12,8 @@ #include "rocksdb/table_properties.h" #include "table/block.h" #include "table/format.h" +#include "table/internal_iterator.h" +#include "table/persistent_cache_helper.h" #include "table/table_properties_internal.h" #include "util/coding.h" @@ -68,10 +70,28 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kFilterSize, props.filter_size); Add(TablePropertiesNames::kFormatVersion, props.format_version); Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); + Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id); if (!props.filter_policy_name.empty()) { - Add(TablePropertiesNames::kFilterPolicy, - props.filter_policy_name); + Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); + } + if (!props.comparator_name.empty()) { + Add(TablePropertiesNames::kComparator, props.comparator_name); + } + + if (!props.merge_operator_name.empty()) { + Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name); + } + if (!props.property_collectors_names.empty()) { + Add(TablePropertiesNames::kPropertyCollectors, + props.property_collectors_names); + } + if (!props.column_family_name.empty()) { + Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name); + } + + if (!props.compression_name.empty()) { + Add(TablePropertiesNames::kCompression, props.compression_name); } } @@ -130,7 +150,7 @@ bool NotifyCollectTableCollectorsOnFinish( } Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - const Footer& footer, Env* env, Logger* logger, + const Footer& footer, const ImmutableCFOptions &ioptions, TableProperties** table_properties) { assert(table_properties); @@ -145,14 +165,14 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, read_options.verify_checksums = false; Status s; s = ReadBlockContents(file, footer, read_options, handle, &block_contents, - env, false); + ioptions, false /* decompress */); 
if (!s.ok()) { return s; } Block properties_block(std::move(block_contents)); - std::unique_ptr iter( + std::unique_ptr iter( properties_block.NewIterator(BytewiseComparator())); auto new_table_properties = new TableProperties(); @@ -170,7 +190,10 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, {TablePropertiesNames::kFormatVersion, &new_table_properties->format_version}, {TablePropertiesNames::kFixedKeyLen, - &new_table_properties->fixed_key_len}, }; + &new_table_properties->fixed_key_len}, + {TablePropertiesNames::kColumnFamilyId, + &new_table_properties->column_family_id}, + }; std::string last_key; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -196,12 +219,23 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, auto error_msg = "Detect malformed value in properties meta-block:" "\tkey: " + key + "\tval: " + raw_val.ToString(); - Log(InfoLogLevel::ERROR_LEVEL, logger, "%s", error_msg.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, ioptions.info_log, "%s", + error_msg.c_str()); continue; } *(pos->second) = val; } else if (key == TablePropertiesNames::kFilterPolicy) { new_table_properties->filter_policy_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kColumnFamilyName) { + new_table_properties->column_family_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kComparator) { + new_table_properties->comparator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kMergeOperator) { + new_table_properties->merge_operator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPropertyCollectors) { + new_table_properties->property_collectors_names = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompression) { + new_table_properties->compression_name = raw_val.ToString(); } else { // handle user-collected properties new_table_properties->user_collected_properties.insert( @@ -218,8 +252,9 @@ Status 
ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, } Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, Env* env, - Logger* info_log, TableProperties** properties) { + uint64_t table_magic_number, + const ImmutableCFOptions &ioptions, + TableProperties** properties) { // -- Read metaindex block Footer footer; auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); @@ -232,12 +267,12 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, ReadOptions read_options; read_options.verify_checksums = false; s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); + &metaindex_contents, ioptions, false /* decompress */); if (!s.ok()) { return s; } Block metaindex_block(std::move(metaindex_contents)); - std::unique_ptr meta_iter( + std::unique_ptr meta_iter( metaindex_block.NewIterator(BytewiseComparator())); // -- Read property block @@ -249,8 +284,7 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, TableProperties table_properties; if (found_properties_block == true) { - s = ReadProperties(meta_iter->value(), file, footer, env, info_log, - properties); + s = ReadProperties(meta_iter->value(), file, footer, ioptions, properties); } else { s = Status::NotFound(); } @@ -258,7 +292,7 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, return s; } -Status FindMetaBlock(Iterator* meta_index_iter, +Status FindMetaBlock(InternalIterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle) { meta_index_iter->Seek(meta_block_name); @@ -272,7 +306,8 @@ Status FindMetaBlock(Iterator* meta_index_iter, } Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, Env* env, + uint64_t table_magic_number, + const ImmutableCFOptions &ioptions, const std::string& meta_block_name, BlockHandle* 
block_handle) { Footer footer; @@ -286,20 +321,21 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, ReadOptions read_options; read_options.verify_checksums = false; s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); + &metaindex_contents, ioptions, false /* do decompression */); if (!s.ok()) { return s; } Block metaindex_block(std::move(metaindex_contents)); - std::unique_ptr meta_iter; + std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, Env* env, + uint64_t table_magic_number, + const ImmutableCFOptions &ioptions, const std::string& meta_block_name, BlockContents* contents) { Status status; @@ -315,7 +351,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, ReadOptions read_options; read_options.verify_checksums = false; status = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); + &metaindex_contents, ioptions, + false /* decompress */); if (!status.ok()) { return status; } @@ -323,7 +360,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, // Finding metablock Block metaindex_block(std::move(metaindex_contents)); - std::unique_ptr meta_iter; + std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); BlockHandle block_handle; @@ -335,7 +372,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, // Reading metablock return ReadBlockContents(file, footer, read_options, block_handle, contents, - env, false); + ioptions, false /* decompress */); } } // namespace rocksdb diff --git a/external/rocksdb/table/meta_blocks.h b/external/rocksdb/table/meta_blocks.h index 005bcaae2d..99084d7902 100644 --- 
a/external/rocksdb/table/meta_blocks.h +++ b/external/rocksdb/table/meta_blocks.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,12 +11,12 @@ #include "db/builder.h" #include "db/table_properties_collector.h" +#include "util/kv_map.h" #include "rocksdb/comparator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "table/block_builder.h" #include "table/format.h" -#include "util/stl_wrappers.h" namespace rocksdb { @@ -27,6 +27,7 @@ class Footer; class Logger; class RandomAccessFile; struct TableProperties; +class InternalIterator; class MetaIndexBuilder { public: @@ -93,7 +94,7 @@ bool NotifyCollectTableCollectorsOnFinish( // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - const Footer& footer, Env* env, Logger* logger, + const Footer& footer, const ImmutableCFOptions &ioptions, TableProperties** table_properties); // Directly read the properties from the properties block of a plain table. @@ -101,17 +102,19 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, Env* env, - Logger* info_log, TableProperties** properties); + uint64_t table_magic_number, + const ImmutableCFOptions &ioptions, + TableProperties** properties); // Find the meta block from the meta index block. 
-Status FindMetaBlock(Iterator* meta_index_iter, +Status FindMetaBlock(InternalIterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle); // Find the meta block Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, Env* env, + uint64_t table_magic_number, + const ImmutableCFOptions &ioptions, const std::string& meta_block_name, BlockHandle* block_handle); @@ -119,7 +122,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, // from `file` and initialize `contents` with contents of this block. // Return Status::OK in case of success. Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, Env* env, + uint64_t table_magic_number, + const ImmutableCFOptions &ioptions, const std::string& meta_block_name, BlockContents* contents); diff --git a/external/rocksdb/table/mock_table.cc b/external/rocksdb/table/mock_table.cc index ff56d6311c..2b311d0f8b 100644 --- a/external/rocksdb/table/mock_table.cc +++ b/external/rocksdb/table/mock_table.cc @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -28,12 +28,13 @@ stl_wrappers::KVMap MakeMockFile( return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); } -Iterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena) { +InternalIterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena, + bool skip_filters) { return new MockTableIterator(table_); } Status MockTableReader::Get(const ReadOptions&, const Slice& key, - GetContext* get_context) { + GetContext* get_context, bool skip_filters) { std::unique_ptr iter(new MockTableIterator(table_)); for (iter->Seek(key); iter->Valid(); iter->Next()) { ParsedInternalKey parsed_key; @@ -58,7 +59,8 @@ MockTableFactory::MockTableFactory() : next_id_(1) {} Status MockTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const { + unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache) const { uint32_t id = GetIDFromFile(file.get()); MutexLock lock_guard(&file_system_.mutex); @@ -74,9 +76,9 @@ Status MockTableFactory::NewTableReader( } TableBuilder* MockTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, WritableFileWriter* file) const { - uint32_t id = GetAndWriteNextID(file->writable_file()); + uint32_t id = GetAndWriteNextID(file); return new MockTableBuilder(id, &file_system_); } @@ -89,12 +91,14 @@ Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, return s; } - uint32_t id = GetAndWriteNextID(file.get()); + WritableFileWriter file_writer(std::move(file), EnvOptions()); + + uint32_t id = GetAndWriteNextID(&file_writer); file_system_.files.insert({id, std::move(file_contents)}); return Status::OK(); } -uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const { +uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const { uint32_t next_id = next_id_.fetch_add(1); 
char buf[4]; EncodeFixed32(buf, next_id); diff --git a/external/rocksdb/table/mock_table.h b/external/rocksdb/table/mock_table.h index 322a51d1e3..d9afba46f8 100644 --- a/external/rocksdb/table/mock_table.h +++ b/external/rocksdb/table/mock_table.h @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,13 +14,14 @@ #include #include +#include "util/kv_map.h" #include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/table.h" +#include "table/internal_iterator.h" #include "table/table_builder.h" #include "table/table_reader.h" #include "util/mutexlock.h" -#include "util/stl_wrappers.h" #include "util/testharness.h" #include "util/testutil.h" @@ -39,10 +40,11 @@ class MockTableReader : public TableReader { public: explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} - Iterator* NewIterator(const ReadOptions&, Arena* arena) override; + InternalIterator* NewIterator(const ReadOptions&, Arena* arena, + bool skip_filters = false) override; - Status Get(const ReadOptions&, const Slice& key, - GetContext* get_context) override; + Status Get(const ReadOptions&, const Slice& key, GetContext* get_context, + bool skip_filters = false) override; uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } @@ -58,7 +60,7 @@ class MockTableReader : public TableReader { const stl_wrappers::KVMap& table_; }; -class MockTableIterator : public Iterator { +class MockTableIterator : public InternalIterator { public: explicit MockTableIterator(const stl_wrappers::KVMap& 
table) : table_(table) { itr_ = table_.end(); @@ -145,13 +147,14 @@ class MockTableFactory : public TableFactory { public: MockTableFactory(); const char* Name() const override { return "MockTable"; } - Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader) const override; + Status NewTableReader( + const TableReaderOptions& table_reader_options, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFileWriter* file) const override; + uint32_t column_familly_id, WritableFileWriter* file) const override; // This function will directly create mock table instead of going through // MockTableBuilder. file_contents has to have a format of IsCompressed()); + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // insert content to cache + cache_options.persistent_cache->Insert(key, data, size); +} + +void PersistentCacheHelper::InsertUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const BlockContents& contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + if (!contents.cachable || contents.compression_type != kNoCompression) { + // We shouldn't cache this. 
Either + // (1) content is not cacheable + // (2) content is compressed + return; + } + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // insert block contents to page cache + cache_options.persistent_cache->Insert(key, contents.data.data(), + contents.data.size()); +} + +Status PersistentCacheHelper::LookupRawPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + std::unique_ptr* raw_data, const size_t raw_data_size) { + assert(cache_options.persistent_cache); + assert(cache_options.persistent_cache->IsCompressed()); + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // Lookup page + size_t size; + Status s = cache_options.persistent_cache->Lookup(key, raw_data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // cache hit + assert(raw_data_size == handle.size() + kBlockTrailerSize); + assert(size == raw_data_size); + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + return Status::OK(); +} + +Status PersistentCacheHelper::LookupUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + BlockContents* contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + if (!contents) { + // We shouldn't lookup in the cache. 
Either + // (1) Nowhere to store + return Status::NotFound(); + } + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // Lookup page + std::unique_ptr data; + size_t size; + Status s = cache_options.persistent_cache->Lookup(key, &data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // please note we are potentially comparing compressed data size with + // uncompressed data size + assert(handle.size() <= size); + + // update stats + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + // construct result and return + *contents = + BlockContents(std::move(data), size, false /*cacheable*/, kNoCompression); + return Status::OK(); +} + +} // namespace rocksdb diff --git a/external/rocksdb/table/persistent_cache_helper.h b/external/rocksdb/table/persistent_cache_helper.h new file mode 100644 index 0000000000..45a1f87d2a --- /dev/null +++ b/external/rocksdb/table/persistent_cache_helper.h @@ -0,0 +1,63 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#pragma once + +#include + +#include "table/block_based_table_reader.h" +#include "util/statistics.h" + +namespace rocksdb { + +struct BlockContents; + +// PersistentCacheOptions +// +// This describe the caching behavior for page cache +// This is used to pass the context for caching and the cache handle +struct PersistentCacheOptions { + PersistentCacheOptions() {} + explicit PersistentCacheOptions( + const std::shared_ptr& _persistent_cache, + const std::string _key_prefix, Statistics* const _statistics) + : persistent_cache(_persistent_cache), + key_prefix(_key_prefix), + statistics(_statistics) {} + + virtual ~PersistentCacheOptions() {} + + std::shared_ptr persistent_cache; + std::string key_prefix; + Statistics* statistics = nullptr; +}; + +// PersistentCacheHelper +// +// Encapsulates some of the helper logic for read and writing from the cache +class PersistentCacheHelper { + public: + // insert block into raw page cache + static void InsertRawPage(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, const char* data, + const size_t size); + + // insert block into uncompressed cache + static void InsertUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const BlockContents& contents); + + // lookup block from raw page cacge + static Status LookupRawPage(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + std::unique_ptr* raw_data, + const size_t raw_data_size); + + // lookup block from uncompressed cache + static Status LookupUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + BlockContents* contents); +}; + +} // namespace rocksdb diff --git a/external/rocksdb/table/plain_table_builder.cc b/external/rocksdb/table/plain_table_builder.cc index e16224a9d3..b438ed86e2 100644 --- a/external/rocksdb/table/plain_table_builder.cc +++ b/external/rocksdb/table/plain_table_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -60,9 +60,10 @@ PlainTableBuilder::PlainTableBuilder( const ImmutableCFOptions& ioptions, const std::vector>* int_tbl_prop_collector_factories, - WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type, - size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, - size_t huge_page_tlb_size, double hash_table_ratio, + uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) : ioptions_(ioptions), bloom_block_(num_probes), @@ -94,6 +95,8 @@ PlainTableBuilder::PlainTableBuilder( // To support roll-back to previous version, now still use version 0 for // plain encoding. properties_.format_version = (encoding_type == kPlain) ? 
0 : 1; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; if (ioptions_.prefix_extractor) { properties_.user_collected_properties @@ -108,7 +111,7 @@ PlainTableBuilder::PlainTableBuilder( for (auto& collector_factories : *int_tbl_prop_collector_factories) { table_properties_collectors_.emplace_back( - collector_factories->CreateIntTblPropCollector()); + collector_factories->CreateIntTblPropCollector(column_family_id)); } } diff --git a/external/rocksdb/table/plain_table_builder.h b/external/rocksdb/table/plain_table_builder.h index 75ec3facdb..ba63a82f6f 100644 --- a/external/rocksdb/table/plain_table_builder.h +++ b/external/rocksdb/table/plain_table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -6,15 +6,16 @@ #pragma once #ifndef ROCKSDB_LITE #include +#include #include #include "rocksdb/options.h" #include "rocksdb/status.h" -#include "table/table_builder.h" -#include "table/plain_table_key_coding.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/bloom_block.h" #include "table/plain_table_index.h" +#include "table/plain_table_key_coding.h" +#include "table/table_builder.h" namespace rocksdb { @@ -34,9 +35,10 @@ class PlainTableBuilder: public TableBuilder { const ImmutableCFOptions& ioptions, const std::vector>* int_tbl_prop_collector_factories, - WritableFileWriter* file, uint32_t user_key_size, - EncodingType encoding_type, size_t index_sparseness, - uint32_t bloom_bits_per_key, uint32_t num_probes = 6, + uint32_t column_family_id, WritableFileWriter* file, + uint32_t user_key_size, EncodingType encoding_type, + size_t index_sparseness, uint32_t bloom_bits_per_key, + const std::string& column_family_name, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, bool store_index_in_file = false); diff --git a/external/rocksdb/table/plain_table_factory.cc b/external/rocksdb/table/plain_table_factory.cc index 6e86ff54fc..55a68fce17 100644 --- a/external/rocksdb/table/plain_table_factory.cc +++ b/external/rocksdb/table/plain_table_factory.cc @@ -17,16 +17,18 @@ namespace rocksdb { Status PlainTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const { + unique_ptr* table, + bool prefetch_index_and_filter_in_cache) const { return PlainTableReader::Open( table_reader_options.ioptions, table_reader_options.env_options, table_reader_options.internal_comparator, std::move(file), file_size, - table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, - huge_page_tlb_size_, full_scan_mode_); + table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, + table_options_.index_sparseness, 
table_options_.huge_page_tlb_size, + table_options_.full_scan_mode); } TableBuilder* PlainTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, WritableFileWriter* file) const { // Ignore the skip_filters flag. PlainTable format is optimized for small // in-memory dbs. The skip_filters optimization is not useful for plain @@ -34,9 +36,12 @@ TableBuilder* PlainTableFactory::NewTableBuilder( // return new PlainTableBuilder( table_builder_options.ioptions, - table_builder_options.int_tbl_prop_collector_factories, file, - user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6, - huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_); + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_options_.user_key_len, table_options_.encoding_type, + table_options_.index_sparseness, table_options_.bloom_bits_per_key, + table_builder_options.column_family_name, 6, + table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, + table_options_.store_index_in_file); } std::string PlainTableFactory::GetPrintableTableOptions() const { @@ -46,32 +51,36 @@ std::string PlainTableFactory::GetPrintableTableOptions() const { char buffer[kBufferSize]; snprintf(buffer, kBufferSize, " user_key_len: %u\n", - user_key_len_); + table_options_.user_key_len); ret.append(buffer); snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n", - bloom_bits_per_key_); + table_options_.bloom_bits_per_key); ret.append(buffer); snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", - hash_table_ratio_); + table_options_.hash_table_ratio); ret.append(buffer); snprintf(buffer, kBufferSize, " index_sparseness: %" ROCKSDB_PRIszt "\n", - index_sparseness_); + table_options_.index_sparseness); ret.append(buffer); snprintf(buffer, kBufferSize, " huge_page_tlb_size: %" ROCKSDB_PRIszt "\n", - huge_page_tlb_size_); + 
table_options_.huge_page_tlb_size); ret.append(buffer); snprintf(buffer, kBufferSize, " encoding_type: %d\n", - encoding_type_); + table_options_.encoding_type); ret.append(buffer); snprintf(buffer, kBufferSize, " full_scan_mode: %d\n", - full_scan_mode_); + table_options_.full_scan_mode); ret.append(buffer); snprintf(buffer, kBufferSize, " store_index_in_file: %d\n", - store_index_in_file_); + table_options_.store_index_in_file); ret.append(buffer); return ret; } +const PlainTableOptions& PlainTableFactory::table_options() const { + return table_options_; +} + extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { return new PlainTableFactory(options); } diff --git a/external/rocksdb/table/plain_table_factory.h b/external/rocksdb/table/plain_table_factory.h index 539e7539df..33cd313471 100644 --- a/external/rocksdb/table/plain_table_factory.h +++ b/external/rocksdb/table/plain_table_factory.h @@ -142,28 +142,25 @@ class PlainTableFactory : public TableFactory { // huge_page_tlb_size determines whether to allocate hash indexes from huge // page TLB and the page size if allocating from there. See comments of // Arena::AllocateAligned() for details. 
- explicit PlainTableFactory(const PlainTableOptions& options = - PlainTableOptions()) - : user_key_len_(options.user_key_len), - bloom_bits_per_key_(options.bloom_bits_per_key), - hash_table_ratio_(options.hash_table_ratio), - index_sparseness_(options.index_sparseness), - huge_page_tlb_size_(options.huge_page_tlb_size), - encoding_type_(options.encoding_type), - full_scan_mode_(options.full_scan_mode), - store_index_in_file_(options.store_index_in_file) {} + explicit PlainTableFactory( + const PlainTableOptions& _table_options = PlainTableOptions()) + : table_options_(_table_options) {} + const char* Name() const override { return "PlainTable"; } Status NewTableReader(const TableReaderOptions& table_reader_options, unique_ptr&& file, - uint64_t file_size, - unique_ptr* table) const override; + uint64_t file_size, unique_ptr* table, + bool prefetch_index_and_filter_in_cache) const override; + TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFileWriter* file) const override; + uint32_t column_family_id, WritableFileWriter* file) const override; std::string GetPrintableTableOptions() const override; - static const char kValueTypeSeqId0 = 0xFF; + const PlainTableOptions& table_options() const; + + static const char kValueTypeSeqId0 = char(0xFF); // Sanitizes the specified DB Options. 
Status SanitizeOptions(const DBOptions& db_opts, @@ -171,15 +168,10 @@ class PlainTableFactory : public TableFactory { return Status::OK(); } + void* GetOptions() override { return &table_options_; } + private: - uint32_t user_key_len_; - int bloom_bits_per_key_; - double hash_table_ratio_; - size_t index_sparseness_; - size_t huge_page_tlb_size_; - EncodingType encoding_type_; - bool full_scan_mode_; - bool store_index_in_file_; + PlainTableOptions table_options_; }; } // namespace rocksdb diff --git a/external/rocksdb/table/plain_table_index.cc b/external/rocksdb/table/plain_table_index.cc index 3e422c3c2b..c8081c006b 100644 --- a/external/rocksdb/table/plain_table_index.cc +++ b/external/rocksdb/table/plain_table_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -117,7 +117,8 @@ void PlainTableIndexBuilder::AllocateIndex() { index_size_ = 1; } else { double hash_table_size_multipier = 1.0 / hash_table_ratio_; - index_size_ = num_prefixes_ * hash_table_size_multipier + 1; + index_size_ = + static_cast(num_prefixes_ * hash_table_size_multipier) + 1; assert(index_size_ > 0); } } @@ -186,7 +187,7 @@ Slice PlainTableIndexBuilder::FillIndexes( index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask; char* prev_ptr = &sub_index[sub_index_offset]; char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); - sub_index_offset += (cur_ptr - prev_ptr); + sub_index_offset += static_cast(cur_ptr - prev_ptr); char* sub_index_pos = &sub_index[sub_index_offset]; IndexRecord* record = hash_to_offsets[i]; int j; diff --git a/external/rocksdb/table/plain_table_index.h b/external/rocksdb/table/plain_table_index.h index be8ad16392..ab2be3d1e2 100644 --- a/external/rocksdb/table/plain_table_index.h +++ b/external/rocksdb/table/plain_table_index.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/external/rocksdb/table/plain_table_key_coding.cc b/external/rocksdb/table/plain_table_key_coding.cc index 057c7f90ff..8442f11295 100644 --- a/external/rocksdb/table/plain_table_key_coding.cc +++ b/external/rocksdb/table/plain_table_key_coding.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -164,47 +164,62 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, return Status::OK(); } -inline bool PlainTableKeyDecoder::FileReader::Read(uint32_t file_offset, - uint32_t len, Slice* out) { - if (file_info_->is_mmap_mode) { - assert(file_offset + len <= file_info_->data_end_offset); - *out = Slice(file_info_->file_data.data() + file_offset, len); - return true; - } else { - return ReadNonMmap(file_offset, len, out); - } +Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset, + uint32_t len) { + assert(file_offset + len <= file_info_->data_end_offset); + return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset), + len); } -bool PlainTableKeyDecoder::FileReader::ReadNonMmap(uint32_t file_offset, - uint32_t len, Slice* out) { +bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len, + Slice* out) { const uint32_t kPrefetchSize = 256u; - if (file_offset < buf_start_offset_ || - file_offset + len > buf_start_offset_ + buf_len_) { - // Load buffer - assert(file_offset + len <= file_info_->data_end_offset); - uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset, - std::max(kPrefetchSize, len)); - if (size_to_read > buf_capacity_) { - buf_.reset(new char[size_to_read]); - buf_capacity_ = size_to_read; - buf_len_ = 0; - } - Slice read_result; - Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, - buf_.get()); - if (!s.ok()) { - status_ = s; - return false; + + // Try to read from buffers. 
+ for (uint32_t i = 0; i < num_buf_; i++) { + Buffer* buffer = buffers_[num_buf_ - 1 - i].get(); + if (file_offset >= buffer->buf_start_offset && + file_offset + len <= buffer->buf_start_offset + buffer->buf_len) { + *out = GetFromBuffer(buffer, file_offset, len); + return true; } - buf_start_offset_ = file_offset; - buf_len_ = size_to_read; } - *out = Slice(buf_.get() + (file_offset - buf_start_offset_), len); + + Buffer* new_buffer; + // Data needed is not in any of the buffer. Allocate a new buffer. + if (num_buf_ < buffers_.size()) { + // Add a new buffer + new_buffer = new Buffer(); + buffers_[num_buf_++].reset(new_buffer); + } else { + // Now simply replace the last buffer. Can improve the placement policy + // if needed. + new_buffer = buffers_[num_buf_ - 1].get(); + } + + assert(file_offset + len <= file_info_->data_end_offset); + uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset, + std::max(kPrefetchSize, len)); + if (size_to_read > new_buffer->buf_capacity) { + new_buffer->buf.reset(new char[size_to_read]); + new_buffer->buf_capacity = size_to_read; + new_buffer->buf_len = 0; + } + Slice read_result; + Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, + new_buffer->buf.get()); + if (!s.ok()) { + status_ = s; + return false; + } + new_buffer->buf_start_offset = file_offset; + new_buffer->buf_len = size_to_read; + *out = GetFromBuffer(new_buffer, file_offset, len); return true; } -inline bool PlainTableKeyDecoder::FileReader::ReadVarint32( - uint32_t offset, uint32_t* out, uint32_t* bytes_read) { +inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { if (file_info_->is_mmap_mode) { const char* start = file_info_->file_data.data() + offset; const char* limit = @@ -218,8 +233,8 @@ inline bool PlainTableKeyDecoder::FileReader::ReadVarint32( } } -bool PlainTableKeyDecoder::FileReader::ReadVarint32NonMmap( - uint32_t offset, uint32_t* out, uint32_t* bytes_read) { 
+bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { const char* start; const char* limit; const uint32_t kMaxVarInt32Size = 6u; @@ -298,7 +313,7 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset, if (!s.ok()) { return s; } - if (!file_reader_.file_info_->is_mmap_mode) { + if (!file_reader_.file_info()->is_mmap_mode) { cur_key_.SetInternalKey(*parsed_key); parsed_key->user_key = Slice(cur_key_.GetKey().data(), user_key_size); if (internal_key != nullptr) { @@ -348,14 +363,14 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey( if (!s.ok()) { return s; } - if (!file_reader_.file_info_->is_mmap_mode || + if (!file_reader_.file_info()->is_mmap_mode || (internal_key != nullptr && !decoded_internal_key_valid)) { // In non-mmap mode, always need to make a copy of keys returned to // users, because after reading value for the key, the key might // be invalid. cur_key_.SetInternalKey(*parsed_key); saved_user_key_ = cur_key_.GetKey(); - if (!file_reader_.file_info_->is_mmap_mode) { + if (!file_reader_.file_info()->is_mmap_mode) { parsed_key->user_key = Slice(cur_key_.GetKey().data(), size); } if (internal_key != nullptr) { @@ -394,7 +409,7 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey( if (!s.ok()) { return s; } - if (!file_reader_.file_info_->is_mmap_mode) { + if (!file_reader_.file_info()->is_mmap_mode) { // In non-mmap mode, we need to make a copy of keys returned to // users, because after reading value for the key, the key might // be invalid. diff --git a/external/rocksdb/table/plain_table_key_coding.h b/external/rocksdb/table/plain_table_key_coding.h index a98010d5b7..ed4ce5d387 100644 --- a/external/rocksdb/table/plain_table_key_coding.h +++ b/external/rocksdb/table/plain_table_key_coding.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,8 +6,10 @@ #pragma once #ifndef ROCKSDB_LITE +#include #include "rocksdb/slice.h" #include "db/dbformat.h" +#include "table/plain_table_reader.h" namespace rocksdb { @@ -51,6 +53,74 @@ class PlainTableKeyEncoder { IterKey pre_prefix_; }; +class PlainTableFileReader { + public: + explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) + : file_info_(_file_info), num_buf_(0) {} + // In mmaped mode, the results point to mmaped area of the file, which + // means it is always valid before closing the file. + // In non-mmap mode, the results point to an internal buffer. If the caller + // makes another read call, the results may not be valid. So callers should + // make a copy when needed. + // In order to save read calls to files, we keep two internal buffers: + // the first read and the most recent read. This is efficient because it + // covers these two common use cases: + // (1) hash index only identifies one location, we read the key to verify + // the location, and read key and value if it is the right location. + // (2) after hash index checking, we identify two locations (because of + // hash bucket conflicts), we binary search the two locations to see + // which one is what we need and start to read from the location. + // These two most common use cases will be covered by the two buffers + // so that we don't need to re-read the same location. + // Currently we keep a fixed size buffer. If a read doesn't exactly fit + // the buffer, we replace the second buffer with the location user reads. + // + // If return false, status code is stored in status_. 
+ bool Read(uint32_t file_offset, uint32_t len, Slice* out) { + if (file_info_->is_mmap_mode) { + assert(file_offset + len <= file_info_->data_end_offset); + *out = Slice(file_info_->file_data.data() + file_offset, len); + return true; + } else { + return ReadNonMmap(file_offset, len, out); + } + } + + // If return false, status code is stored in status_. + bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); + + // *bytes_read = 0 means eof. false means failure and status is saved + // in status_. Not directly returning Status to save copying status + // object to map previous performance of mmap mode. + inline bool ReadVarint32(uint32_t offset, uint32_t* output, + uint32_t* bytes_read); + + bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, + uint32_t* bytes_read); + + Status status() const { return status_; } + + const PlainTableReaderFileInfo* file_info() { return file_info_; } + + private: + const PlainTableReaderFileInfo* file_info_; + + struct Buffer { + Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {} + std::unique_ptr buf; + uint32_t buf_start_offset; + uint32_t buf_len; + uint32_t buf_capacity; + }; + + // Keep buffers for two recent reads. + std::array, 2> buffers_; + uint32_t num_buf_; + Status status_; + + Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); +}; + // A helper class to decode keys from input buffer // Actual data format of the key is documented in plain_table_factory.h class PlainTableKeyDecoder { @@ -82,43 +152,7 @@ class PlainTableKeyDecoder { Slice* internal_key, uint32_t* bytes_read, bool* seekable = nullptr); - class FileReader { - public: - explicit FileReader(const PlainTableReaderFileInfo* file_info) - : file_info_(file_info), - buf_start_offset_(0), - buf_len_(0), - buf_capacity_(0) {} - // In mmaped mode, the results point to mmaped area of the file, which - // means it is always valid before closing the file. 
- // In non-mmap mode, the results point to an internal buffer. If the caller - // makes another read call, the results will not be valid. So callers should - // make a copy when needed. - // If return false, status code is stored in status_. - inline bool Read(uint32_t file_offset, uint32_t len, Slice* output); - - // If return false, status code is stored in status_. - bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); - - // *bytes_read = 0 means eof. false means failure and status is saved - // in status_. Not directly returning Status to save copying status - // object to map previous performance of mmap mode. - inline bool ReadVarint32(uint32_t offset, uint32_t* output, - uint32_t* bytes_read); - - bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, - uint32_t* bytes_read); - - Status status() const { return status_; } - - const PlainTableReaderFileInfo* file_info_; - std::unique_ptr buf_; - uint32_t buf_start_offset_; - uint32_t buf_len_; - uint32_t buf_capacity_; - Status status_; - }; - FileReader file_reader_; + PlainTableFileReader file_reader_; EncodingType encoding_type_; uint32_t prefix_len_; uint32_t fixed_user_key_len_; diff --git a/external/rocksdb/table/plain_table_reader.cc b/external/rocksdb/table/plain_table_reader.cc index 1aabbb98f1..a5155254b9 100644 --- a/external/rocksdb/table/plain_table_reader.cc +++ b/external/rocksdb/table/plain_table_reader.cc @@ -22,6 +22,7 @@ #include "table/bloom_block.h" #include "table/filter_block.h" #include "table/format.h" +#include "table/internal_iterator.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" #include "table/plain_table_factory.h" @@ -51,7 +52,7 @@ inline uint32_t GetFixed32Element(const char* base, size_t offset) { } // namespace // Iterator to iterate IndexedTable -class PlainTableIterator : public Iterator { +class PlainTableIterator : public InternalIterator { public: explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); 
~PlainTableIterator(); @@ -127,7 +128,7 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions.env, ioptions.info_log, &props); + ioptions, &props); if (!s.ok()) { return s; } @@ -186,10 +187,11 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, void PlainTableReader::SetupForCompaction() { } -Iterator* PlainTableReader::NewIterator(const ReadOptions& options, - Arena* arena) { +InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options, + Arena* arena, + bool skip_filters) { if (options.total_order_seek && !IsTotalOrderMode()) { - return NewErrorIterator( + return NewErrorInternalIterator( Status::InvalidArgument("total_order_seek not supported"), arena); } if (arena == nullptr) { @@ -291,13 +293,13 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents bloom_block_contents; auto s = ReadMetaBlock(file_info_.file.get(), file_size_, - kPlainTableMagicNumber, ioptions_.env, + kPlainTableMagicNumber, ioptions_, BloomBlockBuilder::kBloomBlock, &bloom_block_contents); bool index_in_file = s.ok(); BlockContents index_block_contents; s = ReadMetaBlock( - file_info_.file.get(), file_size_, kPlainTableMagicNumber, ioptions_.env, + file_info_.file.get(), file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_contents); index_in_file &= s.ok(); @@ -408,7 +410,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, return Status::OK(); } -Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, +Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, + const Slice& target, const Slice& prefix, uint32_t prefix_hash, bool& prefix_matched, uint32_t* offset) const { prefix_matched = false; @@ -434,15 +437,12 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, return 
Status::Corruption(Slice()); } - PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, - ioptions_.prefix_extractor); - // The key is between [low, high). Do a binary search between it. while (high - low > 1) { uint32_t mid = (high + low) / 2; uint32_t file_offset = GetFixed32Element(base_ptr, mid); uint32_t tmp; - Status s = decoder.NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); + Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); if (!s.ok()) { return s; } @@ -467,7 +467,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, ParsedInternalKey low_key; uint32_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); - Status s = decoder.NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); + Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); if (!s.ok()) { return s; } @@ -532,7 +532,7 @@ void PlainTableReader::Prepare(const Slice& target) { } Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, - GetContext* get_context) { + GetContext* get_context, bool skip_filters) { // Check bloom filter first. 
Slice prefix_slice; uint32_t prefix_hash; @@ -558,8 +558,10 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } uint32_t offset; bool prefix_match; - Status s = - GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset); + PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, + ioptions_.prefix_extractor); + Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, + prefix_match, &offset); if (!s.ok()) { return s; @@ -570,8 +572,6 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, return Status::Corruption(Slice()); } Slice found_value; - PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, - ioptions_.prefix_extractor); while (offset < file_info_.data_end_offset) { s = Next(&decoder, &offset, &found_key, nullptr, &found_value); if (!s.ok()) { @@ -661,8 +661,8 @@ void PlainTableIterator::Seek(const Slice& target) { } } bool prefix_match; - status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, - &next_offset_); + status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, + prefix_match, &next_offset_); if (!status_.ok()) { offset_ = next_offset_ = table_->file_info_.data_end_offset; return; diff --git a/external/rocksdb/table/plain_table_reader.h b/external/rocksdb/table/plain_table_reader.h index b9d8cebba4..baa156d798 100644 --- a/external/rocksdb/table/plain_table_reader.h +++ b/external/rocksdb/table/plain_table_reader.h @@ -38,6 +38,7 @@ class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; class GetContext; +class InternalIterator; using std::unique_ptr; using std::unordered_map; @@ -77,12 +78,13 @@ class PlainTableReader: public TableReader { size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode); - Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; + InternalIterator* NewIterator(const ReadOptions&, Arena* arena = nullptr, + bool skip_filters = 
false) override; void Prepare(const Slice& target) override; - Status Get(const ReadOptions&, const Slice& key, - GetContext* get_context) override; + Status Get(const ReadOptions&, const Slice& key, GetContext* get_context, + bool skip_filters = false) override; uint64_t ApproximateOffsetOf(const Slice& key) override; @@ -217,9 +219,9 @@ class PlainTableReader: public TableReader { // Get file offset for key target. // return value prefix_matched is set to true if the offset is confirmed // for a key with the same prefix as target. - Status GetOffset(const Slice& target, const Slice& prefix, - uint32_t prefix_hash, bool& prefix_matched, - uint32_t* offset) const; + Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target, + const Slice& prefix, uint32_t prefix_hash, + bool& prefix_matched, uint32_t* offset) const; bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } diff --git a/external/rocksdb/util/scoped_arena_iterator.h b/external/rocksdb/table/scoped_arena_iterator.h similarity index 56% rename from external/rocksdb/util/scoped_arena_iterator.h rename to external/rocksdb/table/scoped_arena_iterator.h index 2021d2dc22..5629ba5aac 100644 --- a/external/rocksdb/util/scoped_arena_iterator.h +++ b/external/rocksdb/table/scoped_arena_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,22 +7,23 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#include "rocksdb/iterator.h" +#include "table/internal_iterator.h" namespace rocksdb { class ScopedArenaIterator { public: - explicit ScopedArenaIterator(Iterator* iter = nullptr) : iter_(iter) {} + explicit ScopedArenaIterator(InternalIterator* iter = nullptr) + : iter_(iter) {} - Iterator* operator->() { return iter_; } + InternalIterator* operator->() { return iter_; } - void set(Iterator* iter) { iter_ = iter; } + void set(InternalIterator* iter) { iter_ = iter; } - Iterator* get() { return iter_; } + InternalIterator* get() { return iter_; } - ~ScopedArenaIterator() { iter_->~Iterator(); } + ~ScopedArenaIterator() { iter_->~InternalIterator(); } private: - Iterator* iter_; + InternalIterator* iter_; }; } // namespace rocksdb diff --git a/external/rocksdb/table/sst_file_writer.cc b/external/rocksdb/table/sst_file_writer.cc index d780f0a4b4..98f2d1f2dc 100644 --- a/external/rocksdb/table/sst_file_writer.cc +++ b/external/rocksdb/table/sst_file_writer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -57,7 +57,8 @@ class SstFileWriter::SstFileWriterPropertiesCollectorFactory explicit SstFileWriterPropertiesCollectorFactory(int32_t version) : version_(version) {} - virtual IntTblPropCollector* CreateIntTblPropCollector() override { + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t column_family_id) override { return new SstFileWriterPropertiesCollector(version_); } @@ -70,24 +71,28 @@ class SstFileWriter::SstFileWriterPropertiesCollectorFactory }; struct SstFileWriter::Rep { - Rep(const EnvOptions& _env_options, const ImmutableCFOptions& _ioptions, + Rep(const EnvOptions& _env_options, const Options& options, const Comparator* _user_comparator) : env_options(_env_options), - ioptions(_ioptions), + ioptions(options), + mutable_cf_options(options, ioptions), internal_comparator(_user_comparator) {} std::unique_ptr file_writer; std::unique_ptr builder; EnvOptions env_options; ImmutableCFOptions ioptions; + MutableCFOptions mutable_cf_options; InternalKeyComparator internal_comparator; ExternalSstFileInfo file_info; + std::string column_family_name; + InternalKey ikey; }; SstFileWriter::SstFileWriter(const EnvOptions& env_options, - const ImmutableCFOptions& ioptions, + const Options& options, const Comparator* user_comparator) - : rep_(new Rep(env_options, ioptions, user_comparator)) {} + : rep_(new Rep(env_options, options, user_comparator)) {} SstFileWriter::~SstFileWriter() { delete rep_; } @@ -100,10 +105,14 @@ Status SstFileWriter::Open(const std::string& file_path) { return s; } - CompressionType compression_type = r->ioptions.compression; - if (!r->ioptions.compression_per_level.empty()) { + CompressionType compression_type; + if (r->ioptions.bottommost_compression != kDisableCompressionOption) { + compression_type = r->ioptions.bottommost_compression; + } else if (!r->ioptions.compression_per_level.empty()) { // Use the compression of the last level if we have per level compression compression_type = 
*(r->ioptions.compression_per_level.rbegin()); + } else { + compression_type = r->mutable_cf_options.compression; } std::vector> @@ -113,11 +122,15 @@ Status SstFileWriter::Open(const std::string& file_path) { TableBuilderOptions table_builder_options( r->ioptions, r->internal_comparator, &int_tbl_prop_collector_factories, - compression_type, r->ioptions.compression_opts, false); + compression_type, r->ioptions.compression_opts, + nullptr /* compression_dict */, false /* skip_filters */, + r->column_family_name); r->file_writer.reset( new WritableFileWriter(std::move(sst_file), r->env_options)); r->builder.reset(r->ioptions.table_factory->NewTableBuilder( - table_builder_options, r->file_writer.get())); + table_builder_options, + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + r->file_writer.get())); r->file_info.file_path = file_path; r->file_info.file_size = 0; @@ -134,7 +147,7 @@ Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { } if (r->file_info.num_entries == 0) { - r->file_info.smallest_key = user_key.ToString(); + r->file_info.smallest_key.assign(user_key.data(), user_key.size()); } else { if (r->internal_comparator.user_comparator()->Compare( user_key, r->file_info.largest_key) <= 0) { @@ -145,12 +158,12 @@ Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { // update file info r->file_info.num_entries++; - r->file_info.largest_key = user_key.ToString(); + r->file_info.largest_key.assign(user_key.data(), user_key.size()); r->file_info.file_size = r->builder->FileSize(); - InternalKey ikey(user_key, 0 /* Sequence Number */, - ValueType::kTypeValue /* Put */); - r->builder->Add(ikey.Encode(), value); + r->ikey.Set(user_key, 0 /* Sequence Number */, + ValueType::kTypeValue /* Put */); + r->builder->Add(r->ikey.Encode(), value); return Status::OK(); } @@ -160,6 +173,9 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { if (!r->builder) { return Status::InvalidArgument("File is not 
opened"); } + if (r->file_info.num_entries == 0) { + return Status::InvalidArgument("Cannot create sst file with no entries"); + } Status s = r->builder->Finish(); if (s.ok()) { diff --git a/external/rocksdb/table/table_builder.h b/external/rocksdb/table/table_builder.h index 55a1077fa7..be19636f6d 100644 --- a/external/rocksdb/table/table_builder.h +++ b/external/rocksdb/table/table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -25,16 +25,24 @@ class Slice; class Status; struct TableReaderOptions { + // @param skip_filters Disables loading/accessing the filter block TableReaderOptions(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, - const InternalKeyComparator& _internal_comparator) + const InternalKeyComparator& _internal_comparator, + bool _skip_filters = false, int _level = -1) : ioptions(_ioptions), env_options(_env_options), - internal_comparator(_internal_comparator) {} + internal_comparator(_internal_comparator), + skip_filters(_skip_filters), + level(_level) {} const ImmutableCFOptions& ioptions; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; + // This is only used for BlockBasedTable (reader) + bool skip_filters; + // what level this table/file is on, -1 for "not set, don't know" + int level; }; struct TableBuilderOptions { @@ -44,20 +52,27 @@ struct TableBuilderOptions { const std::vector>* _int_tbl_prop_collector_factories, CompressionType _compression_type, - const CompressionOptions& _compression_opts, bool _skip_filters) + const CompressionOptions& _compression_opts, + const std::string* _compression_dict, bool _skip_filters, + const std::string& 
_column_family_name) : ioptions(_ioptions), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), compression_type(_compression_type), compression_opts(_compression_opts), - skip_filters(_skip_filters) {} + compression_dict(_compression_dict), + skip_filters(_skip_filters), + column_family_name(_column_family_name) {} const ImmutableCFOptions& ioptions; const InternalKeyComparator& internal_comparator; const std::vector>* int_tbl_prop_collector_factories; CompressionType compression_type; const CompressionOptions& compression_opts; - bool skip_filters = false; + // Data for presetting the compression library's dictionary, or nullptr. + const std::string* compression_dict; + bool skip_filters; // only used by BlockBasedTableBuilder + const std::string& column_family_name; }; // TableBuilder provides the interface used to build a Table diff --git a/external/rocksdb/table/table_properties.cc b/external/rocksdb/table/table_properties.cc index 86c084385f..12e9054ada 100644 --- a/external/rocksdb/table/table_properties.cc +++ b/external/rocksdb/table/table_properties.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,10 +8,14 @@ #include "rocksdb/iterator.h" #include "rocksdb/env.h" #include "port/port.h" +#include "table/internal_iterator.h" #include "util/string_util.h" namespace rocksdb { +const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily = + port::kMaxInt32; + namespace { void AppendProperty( std::string& props, @@ -36,6 +40,19 @@ namespace { props, key, ToString(value), prop_delim, kv_delim ); } + + // Seek to the specified meta block. 
+ // Return true if it successfully seeks to that block. + Status SeekToMetaBlock(InternalIterator* meta_iter, + const std::string& block_name, bool* is_found) { + *is_found = true; + meta_iter->Seek(block_name); + if (meta_iter->status().ok() && + (!meta_iter->Valid() || meta_iter->key() != block_name)) { + *is_found = false; + } + return meta_iter->status(); + } } std::string TableProperties::ToString( @@ -71,6 +88,36 @@ std::string TableProperties::ToString( filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, prop_delim, kv_delim); + AppendProperty(result, "column family ID", + column_family_id == rocksdb::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily + ? std::string("N/A") + : rocksdb::ToString(column_family_id), + prop_delim, kv_delim); + AppendProperty( + result, "column family name", + column_family_name.empty() ? std::string("N/A") : column_family_name, + prop_delim, kv_delim); + + AppendProperty(result, "comparator name", + comparator_name.empty() ? std::string("N/A") : comparator_name, + prop_delim, kv_delim); + + AppendProperty( + result, "merge operator name", + merge_operator_name.empty() ? std::string("N/A") : merge_operator_name, + prop_delim, kv_delim); + + AppendProperty(result, "property collectors names", + property_collectors_names.empty() ? std::string("N/A") + : property_collectors_names, + prop_delim, kv_delim); + + AppendProperty( + result, "SST file compression algo", + compression_name.empty() ? 
std::string("N/A") : compression_name, + prop_delim, kv_delim); + return result; } @@ -104,25 +151,36 @@ const std::string TablePropertiesNames::kFormatVersion = "rocksdb.format.version"; const std::string TablePropertiesNames::kFixedKeyLen = "rocksdb.fixed.key.length"; +const std::string TablePropertiesNames::kColumnFamilyId = + "rocksdb.column.family.id"; +const std::string TablePropertiesNames::kColumnFamilyName = + "rocksdb.column.family.name"; +const std::string TablePropertiesNames::kComparator = "rocksdb.comparator"; +const std::string TablePropertiesNames::kMergeOperator = + "rocksdb.merge.operator"; +const std::string TablePropertiesNames::kPropertyCollectors = + "rocksdb.property.collectors"; +const std::string TablePropertiesNames::kCompression = "rocksdb.compression"; extern const std::string kPropertiesBlock = "rocksdb.properties"; // Old property block name for backward compatibility extern const std::string kPropertiesBlockOldName = "rocksdb.stats"; +extern const std::string kCompressionDictBlock = "rocksdb.compression_dict"; // Seek to the properties block. // Return true if it successfully seeks to the properties block. -Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found) { - *is_found = true; - meta_iter->Seek(kPropertiesBlock); - if (meta_iter->status().ok() && - (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlock)) { - meta_iter->Seek(kPropertiesBlockOldName); - if (meta_iter->status().ok() && - (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlockOldName)) { - *is_found = false; - } +Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) { + Status status = SeekToMetaBlock(meta_iter, kPropertiesBlock, is_found); + if (!*is_found && status.ok()) { + status = SeekToMetaBlock(meta_iter, kPropertiesBlockOldName, is_found); } - return meta_iter->status(); + return status; +} + +// Seek to the compression dictionary block. +// Return true if it successfully seeks to that block. 
+Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found) { + return SeekToMetaBlock(meta_iter, kCompressionDictBlock, is_found); } } // namespace rocksdb diff --git a/external/rocksdb/table/table_properties_internal.h b/external/rocksdb/table/table_properties_internal.h index 9ef8ad4329..3d3a4b5f8b 100644 --- a/external/rocksdb/table/table_properties_internal.h +++ b/external/rocksdb/table/table_properties_internal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,9 +10,16 @@ namespace rocksdb { +class InternalIterator; + // Seek to the properties block. // If it successfully seeks to the properties block, "is_found" will be // set to true. -Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found); +Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found); + +// Seek to the compression dictionary block. +// If it successfully seeks to the compression dictionary block, "is_found" +// will be set to true. +Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found); } // namespace rocksdb diff --git a/external/rocksdb/table/table_reader.h b/external/rocksdb/table/table_reader.h index 2058b868c8..c047bf8cb2 100644 --- a/external/rocksdb/table/table_reader.h +++ b/external/rocksdb/table/table_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -19,6 +19,7 @@ class Arena; struct ReadOptions; struct TableProperties; class GetContext; +class InternalIterator; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from @@ -34,7 +35,11 @@ class TableReader { // When destroying the iterator, the caller will not call "delete" // but Iterator::~Iterator() directly. The destructor needs to destroy // all the states but those allocated in arena. - virtual Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) = 0; + // skip_filters: disables checking the bloom filters even if they exist. This + // option is effective only for block-based table format. + virtual InternalIterator* NewIterator(const ReadOptions&, + Arena* arena = nullptr, + bool skip_filters = false) = 0; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -65,8 +70,10 @@ class TableReader { // // readOptions is the options for the read // key is the key to search for + // skip_filters: disables checking the bloom filters even if they exist. This + // option is effective only for block-based table format. virtual Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context) = 0; + GetContext* get_context, bool skip_filters = false) = 0; // Prefetch data corresponding to a give range of keys // Typically this functionality is required for table implementations that @@ -84,6 +91,8 @@ class TableReader { virtual Status DumpTable(WritableFile* out_file) { return Status::NotSupported("DumpTable() not supported"); } + + virtual void Close() {} }; } // namespace rocksdb diff --git a/external/rocksdb/table/table_reader_bench.cc b/external/rocksdb/table/table_reader_bench.cc index e3baa29ed5..0a227d1e38 100644 --- a/external/rocksdb/table/table_reader_bench.cc +++ b/external/rocksdb/table/table_reader_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,6 +19,7 @@ int main() { #include "db/db_impl.h" #include "db/dbformat.h" #include "table/block_based_table_factory.h" +#include "table/internal_iterator.h" #include "table/plain_table_factory.h" #include "table/table_builder.h" #include "table/get_context.h" @@ -97,8 +98,10 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, tb = opts.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, CompressionType::kNoCompression, - CompressionOptions(), false), - file_writer.get()); + CompressionOptions(), + nullptr /* compression_dict */, + false /* skip_filters */, kDefaultColumnFamilyName), + 0 /* column_family_id */, file_writer.get()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -187,20 +190,24 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string end_key = MakeKey(r1, r2 + r2_len, through_db); uint64_t total_time = 0; uint64_t start_time = Now(env, measured_by_nanosecond); - Iterator* iter; + Iterator* iter = nullptr; + InternalIterator* iiter = nullptr; if (!through_db) { - iter = table_reader->NewIterator(read_options); + iiter = table_reader->NewIterator(read_options); } else { iter = db->NewIterator(read_options); } int count = 0; - for(iter->Seek(start_key); iter->Valid(); iter->Next()) { + for (through_db ? iter->Seek(start_key) : iiter->Seek(start_key); + through_db ? iter->Valid() : iiter->Valid(); + through_db ? 
iter->Next() : iiter->Next()) { if (if_query_empty_keys) { break; } // verify key; total_time += Now(env, measured_by_nanosecond) - start_time; - assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key()); + assert(Slice(MakeKey(r1, r2 + count, through_db)) == + (through_db ? iter->key() : iiter->key())); start_time = Now(env, measured_by_nanosecond); if (++count >= r2_len) { break; @@ -254,6 +261,7 @@ DEFINE_bool(iterator, false, "For test iterator"); DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " "the query will be against DB. Otherwise, will be directly against " "a table reader."); +DEFINE_bool(mmap_read, true, "Whether use mmap read"); DEFINE_string(table_factory, "block_based", "Table factory to use: `block_based` (default), `plain_table` or " "`cuckoo_hash`."); @@ -279,8 +287,8 @@ int main(int argc, char** argv) { if (FLAGS_table_factory == "cuckoo_hash") { #ifndef ROCKSDB_LITE - options.allow_mmap_reads = true; - env_options.use_mmap_reads = true; + options.allow_mmap_reads = FLAGS_mmap_read; + env_options.use_mmap_reads = FLAGS_mmap_read; rocksdb::CuckooTableOptions table_options; table_options.hash_table_ratio = 0.75; tf.reset(rocksdb::NewCuckooTableFactory(table_options)); @@ -290,8 +298,8 @@ int main(int argc, char** argv) { #endif // ROCKSDB_LITE } else if (FLAGS_table_factory == "plain_table") { #ifndef ROCKSDB_LITE - options.allow_mmap_reads = true; - env_options.use_mmap_reads = true; + options.allow_mmap_reads = FLAGS_mmap_read; + env_options.use_mmap_reads = FLAGS_mmap_read; rocksdb::PlainTableOptions plain_table_options; plain_table_options.user_key_len = 16; diff --git a/external/rocksdb/table/table_test.cc b/external/rocksdb/table/table_test.cc index e21503b8ff..1f930f97e5 100644 --- a/external/rocksdb/table/table_test.cc +++ b/external/rocksdb/table/table_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -20,7 +20,7 @@ #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" -#include "db/writebuffer.h" +#include "memtable/stl_wrappers.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -29,6 +29,7 @@ #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" +#include "rocksdb/write_buffer_manager.h" #include "table/block.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_factory.h" @@ -36,16 +37,18 @@ #include "table/block_builder.h" #include "table/format.h" #include "table/get_context.h" +#include "table/internal_iterator.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/random.h" -#include "util/scoped_arena_iterator.h" #include "util/statistics.h" -#include "util/stl_wrappers.h" #include "util/string_util.h" +#include "util/sync_point.h" #include "util/testharness.h" #include "util/testutil.h" +#include "utilities/merge_operators.h" namespace rocksdb { @@ -56,6 +59,40 @@ extern const uint64_t kPlainTableMagicNumber; namespace { +// DummyPropertiesCollector used to test BlockBasedTableProperties +class DummyPropertiesCollector : public TablePropertiesCollector { + public: + const char* Name() const { return ""; } + + Status Finish(UserCollectedProperties* properties) { return Status::OK(); } + + Status Add(const Slice& user_key, const Slice& value) { return Status::OK(); } + + virtual UserCollectedProperties GetReadableProperties() const { + return UserCollectedProperties{}; + } +}; + +class DummyPropertiesCollectorFactory1 + : public TablePropertiesCollectorFactory { + public: + virtual 
TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) { + return new DummyPropertiesCollector(); + } + const char* Name() const { return "DummyPropertiesCollector1"; } +}; + +class DummyPropertiesCollectorFactory2 + : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) { + return new DummyPropertiesCollector(); + } + const char* Name() const { return "DummyPropertiesCollector2"; } +}; + // Return reverse of "key". // Used to test non-lexicographic comparators. std::string Reverse(const Slice& key) { @@ -142,7 +179,7 @@ class Constructor { const InternalKeyComparator& internal_comparator, const stl_wrappers::KVMap& data) = 0; - virtual Iterator* NewIterator() const = 0; + virtual InternalIterator* NewIterator() const = 0; virtual const stl_wrappers::KVMap& data() { return data_; } @@ -188,7 +225,7 @@ class BlockConstructor: public Constructor { block_ = new Block(std::move(contents)); return Status::OK(); } - virtual Iterator* NewIterator() const override { + virtual InternalIterator* NewIterator() const override { return block_->NewIterator(comparator_); } @@ -201,13 +238,14 @@ class BlockConstructor: public Constructor { }; // A helper class that converts internal format keys into user keys -class KeyConvertingIterator: public Iterator { +class KeyConvertingIterator : public InternalIterator { public: - explicit KeyConvertingIterator(Iterator* iter, bool arena_mode = false) + explicit KeyConvertingIterator(InternalIterator* iter, + bool arena_mode = false) : iter_(iter), arena_mode_(arena_mode) {} virtual ~KeyConvertingIterator() { if (arena_mode_) { - iter_->~Iterator(); + iter_->~InternalIterator(); } else { delete iter_; } @@ -241,7 +279,7 @@ class KeyConvertingIterator: public Iterator { private: mutable Status status_; - Iterator* iter_; + InternalIterator* iter_; bool arena_mode_; // No 
copying allowed @@ -268,10 +306,14 @@ class TableConstructor: public Constructor { unique_ptr builder; std::vector> int_tbl_prop_collector_factories; + std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, internal_comparator, &int_tbl_prop_collector_factories, - options.compression, CompressionOptions(), false), + options.compression, CompressionOptions(), + nullptr /* compression_dict */, + false /* skip_filters */, column_family_name), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, file_writer_.get())); for (const auto kv : kv_map) { @@ -300,9 +342,9 @@ class TableConstructor: public Constructor { std::move(file_reader_), GetSink()->contents().size(), &table_reader_); } - virtual Iterator* NewIterator() const override { + virtual InternalIterator* NewIterator() const override { ReadOptions ro; - Iterator* iter = table_reader_->NewIterator(ro); + InternalIterator* iter = table_reader_->NewIterator(ro); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -330,6 +372,8 @@ class TableConstructor: public Constructor { return convert_to_internal_key_; } + void ResetTableReader() { table_reader_.reset(); } + private: void Reset() { uniq_id_ = 0; @@ -357,10 +401,10 @@ uint64_t TableConstructor::cur_uniq_id_ = 1; class MemTableConstructor: public Constructor { public: - explicit MemTableConstructor(const Comparator* cmp, WriteBuffer* wb) + explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb) : Constructor(cmp), internal_comparator_(cmp), - write_buffer_(wb), + write_buffer_manager_(wb), table_factory_(new SkipListFactory) { options_.memtable_factory = table_factory_; ImmutableCFOptions ioptions(options_); @@ -380,7 +424,7 @@ class MemTableConstructor: public Constructor { ImmutableCFOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, MutableCFOptions(options_, mem_ioptions), - write_buffer_, 
kMaxSequenceNumber); + write_buffer_manager_, kMaxSequenceNumber); memtable_->Ref(); int seq = 1; for (const auto kv : kv_map) { @@ -389,7 +433,7 @@ class MemTableConstructor: public Constructor { } return Status::OK(); } - virtual Iterator* NewIterator() const override { + virtual InternalIterator* NewIterator() const override { return new KeyConvertingIterator( memtable_->NewIterator(ReadOptions(), &arena_), true); } @@ -402,11 +446,28 @@ class MemTableConstructor: public Constructor { mutable Arena arena_; InternalKeyComparator internal_comparator_; Options options_; - WriteBuffer* write_buffer_; + WriteBufferManager* write_buffer_manager_; MemTable* memtable_; std::shared_ptr table_factory_; }; +class InternalIteratorFromIterator : public InternalIterator { + public: + explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {} + virtual bool Valid() const override { return it_->Valid(); } + virtual void Seek(const Slice& target) override { it_->Seek(target); } + virtual void SeekToFirst() override { it_->SeekToFirst(); } + virtual void SeekToLast() override { it_->SeekToLast(); } + virtual void Next() override { it_->Next(); } + virtual void Prev() override { it_->Prev(); } + Slice key() const override { return it_->key(); } + Slice value() const override { return it_->value(); } + virtual Status status() const override { return it_->status(); } + + private: + unique_ptr it_; +}; + class DBConstructor: public Constructor { public: explicit DBConstructor(const Comparator* cmp) @@ -433,8 +494,9 @@ class DBConstructor: public Constructor { } return Status::OK(); } - virtual Iterator* NewIterator() const override { - return db_->NewIterator(ReadOptions()); + + virtual InternalIterator* NewIterator() const override { + return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions())); } virtual DB* db() const override { return db_; } @@ -514,6 +576,10 @@ static std::vector GenerateArgList() { compression_types.emplace_back(kLZ4HCCompression, false); 
compression_types.emplace_back(kLZ4HCCompression, true); } + if (XPRESS_Supported()) { + compression_types.emplace_back(kXpressCompression, false); + compression_types.emplace_back(kXpressCompression, true); + } if (ZSTD_Supported()) { compression_types.emplace_back(kZSTDNotFinalCompression, false); compression_types.emplace_back(kZSTDNotFinalCompression, true); @@ -616,6 +682,7 @@ class HarnessTest : public testing::Test { new FlushBlockBySizePolicyFactory()); table_options_.block_size = 256; table_options_.block_restart_interval = args.restart_interval; + table_options_.index_block_restart_interval = args.restart_interval; table_options_.format_version = args.format_version; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); @@ -704,7 +771,7 @@ class HarnessTest : public testing::Test { void TestForwardScan(const std::vector& keys, const stl_wrappers::KVMap& data) { - Iterator* iter = constructor_->NewIterator(); + InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); iter->SeekToFirst(); for (stl_wrappers::KVMap::const_iterator model_iter = data.begin(); @@ -714,7 +781,7 @@ class HarnessTest : public testing::Test { } ASSERT_TRUE(!iter->Valid()); if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { - iter->~Iterator(); + iter->~InternalIterator(); } else { delete iter; } @@ -722,7 +789,7 @@ class HarnessTest : public testing::Test { void TestBackwardScan(const std::vector& keys, const stl_wrappers::KVMap& data) { - Iterator* iter = constructor_->NewIterator(); + InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); iter->SeekToLast(); for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin(); @@ -732,7 +799,7 @@ class HarnessTest : public testing::Test { } ASSERT_TRUE(!iter->Valid()); if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { - iter->~Iterator(); + iter->~InternalIterator(); } else { delete iter; } @@ -741,7 
+808,7 @@ class HarnessTest : public testing::Test { void TestRandomAccess(Random* rnd, const std::vector& keys, const stl_wrappers::KVMap& data) { static const bool kVerbose = false; - Iterator* iter = constructor_->NewIterator(); + InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); stl_wrappers::KVMap::const_iterator model_iter = data.begin(); if (kVerbose) fprintf(stderr, "---\n"); @@ -805,7 +872,7 @@ class HarnessTest : public testing::Test { } } if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { - iter->~Iterator(); + iter->~InternalIterator(); } else { delete iter; } @@ -829,7 +896,7 @@ class HarnessTest : public testing::Test { } } - std::string ToString(const Iterator* it) { + std::string ToString(const InternalIterator* it) { if (!it->Valid()) { return "END"; } else { @@ -875,7 +942,7 @@ class HarnessTest : public testing::Test { ImmutableCFOptions ioptions_; BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; - WriteBuffer write_buffer_; + WriteBufferManager write_buffer_; bool support_prev_; bool only_support_prefix_seek_; shared_ptr internal_comparator_; @@ -995,6 +1062,63 @@ TEST_F(BlockBasedTableTest, BasicBlockBasedTableProperties) { } Slice content = block_builder.Finish(); ASSERT_EQ(content.size() + kBlockTrailerSize, props.data_size); + c.ResetTableReader(); +} + +TEST_F(BlockBasedTableTest, BlockBasedTableProperties2) { + TableConstructor c(&reverse_key_comparator); + std::vector keys; + stl_wrappers::KVMap kvmap; + + { + Options options; + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + auto& props = *c.GetTableReader()->GetTableProperties(); + + // Default comparator + 
ASSERT_EQ("leveldb.BytewiseComparator", props.comparator_name); + // No merge operator + ASSERT_EQ("nullptr", props.merge_operator_name); + // No property collectors + ASSERT_EQ("[]", props.property_collectors_names); + // No filter policy is used + ASSERT_EQ("", props.filter_policy_name); + // Compression type == that set: + ASSERT_EQ("NoCompression", props.compression_name); + c.ResetTableReader(); + } + + { + Options options; + BlockBasedTableOptions table_options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.comparator = &reverse_key_comparator; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.table_properties_collector_factories.emplace_back( + new DummyPropertiesCollectorFactory1()); + options.table_properties_collector_factories.emplace_back( + new DummyPropertiesCollectorFactory2()); + + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + auto& props = *c.GetTableReader()->GetTableProperties(); + + ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name); + ASSERT_EQ("UInt64AddOperator", props.merge_operator_name); + ASSERT_EQ("[DummyPropertiesCollector1,DummyPropertiesCollector2]", + props.property_collectors_names); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + c.ResetTableReader(); + } } TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) { @@ -1012,6 +1136,7 @@ TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); + c.ResetTableReader(); } // @@ -1037,7 +1162,7 @@ void PrefetchRange(TableConstructor* c, Options* opt, const std::vector& keys_not_in_cache, const Status expected_status = Status::OK()) { // reset the cache and reopen the table - 
table_options->block_cache = NewLRUCache(16 * 1024 * 1024); + table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); const ImmutableCFOptions ioptions2(*opt); ASSERT_OK(c->Reopen(ioptions2)); @@ -1053,6 +1178,7 @@ void PrefetchRange(TableConstructor* c, Options* opt, // assert our expectation in cache warmup AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache); + c->ResetTableReader(); } TEST_F(BlockBasedTableTest, PrefetchTest) { @@ -1065,7 +1191,7 @@ TEST_F(BlockBasedTableTest, PrefetchTest) { BlockBasedTableOptions table_options; table_options.block_size = 1024; // big enough so we don't ever lose cached values. - table_options.block_cache = NewLRUCache(16 * 1024 * 1024); + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); @@ -1080,6 +1206,7 @@ TEST_F(BlockBasedTableTest, PrefetchTest) { stl_wrappers::KVMap kvmap; const ImmutableCFOptions ioptions(opt); c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); + c.ResetTableReader(); // We get the following data spread : // @@ -1135,6 +1262,7 @@ TEST_F(BlockBasedTableTest, PrefetchTest) { PrefetchRange(&c, &opt, &table_options, keys, "k06", "k00", {}, {}, Status::InvalidArgument(Slice("k06 "), Slice("k07"))); + c.ResetTableReader(); } TEST_F(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { @@ -1190,7 +1318,7 @@ TEST_F(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { auto* reader = c.GetTableReader(); ReadOptions ro; ro.total_order_seek = true; - std::unique_ptr iter(reader->NewIterator(ro)); + std::unique_ptr iter(reader->NewIterator(ro)); iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); ASSERT_OK(iter->status()); @@ -1221,6 +1349,41 @@ TEST_F(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { } } +TEST_F(BlockBasedTableTest, NoopTransformSeek) { + BlockBasedTableOptions table_options; + 
table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewNoopTransform()); + + TableConstructor c(options.comparator); + // To tickle the PrefixMayMatch bug it is important that the + // user-key is a single byte so that the index key exactly matches + // the user-key. + InternalKey key("a", 1, kTypeValue); + c.Add(key.Encode().ToString(), "b"); + std::vector keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, table_options, internal_comparator, &keys, + &kvmap); + + auto* reader = c.GetTableReader(); + for (int i = 0; i < 2; ++i) { + ReadOptions ro; + ro.total_order_seek = (i == 0); + std::unique_ptr iter(reader->NewIterator(ro)); + + iter->Seek(key.Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", ExtractUserKey(iter->key()).ToString()); + } +} + static std::string RandomString(Random* rnd, int len) { std::string r; test::RandomString(rnd, len, &r); @@ -1262,7 +1425,7 @@ TEST_F(TableTest, HashIndexTest) { table_options.index_type = BlockBasedTableOptions::kHashSearch; table_options.hash_index_allow_collision = true; table_options.block_size = 1700; - table_options.block_cache = NewLRUCache(1024); + table_options.block_cache = NewLRUCache(1024, 4); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::unique_ptr comparator( @@ -1274,7 +1437,8 @@ TEST_F(TableTest, HashIndexTest) { auto props = reader->GetTableProperties(); ASSERT_EQ(5u, props->num_data_blocks); - std::unique_ptr hash_iter(reader->NewIterator(ReadOptions())); + std::unique_ptr hash_iter( + reader->NewIterator(ReadOptions())); // -- Find keys do not exist, but have common prefix. 
std::vector prefixes = {"001", "003", "005", "007", "009"}; @@ -1342,6 +1506,7 @@ TEST_F(TableTest, HashIndexTest) { ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0); } } + c.ResetTableReader(); } // It's very hard to figure out the index block size of a block accurately. @@ -1382,6 +1547,7 @@ TEST_F(BlockBasedTableTest, IndexSizeStat) { auto index_size = c.GetTableReader()->GetTableProperties()->index_size; ASSERT_GT(index_size, last_index_size); last_index_size = index_size; + c.ResetTableReader(); } } @@ -1408,6 +1574,7 @@ TEST_F(BlockBasedTableTest, NumBlockStat) { GetPlainInternalComparator(options.comparator), &ks, &kvmap); ASSERT_EQ(kvmap.size(), c.GetTableReader()->GetTableProperties()->num_data_blocks); + c.ResetTableReader(); } // A simple tool that takes the snapshot of block cache statistics. @@ -1480,7 +1647,7 @@ TEST_F(BlockBasedTableTest, BlockCacheDisabledTest) { options.create_if_missing = true; options.statistics = CreateDBStatistics(); BlockBasedTableOptions table_options; - table_options.block_cache = NewLRUCache(1024); + table_options.block_cache = NewLRUCache(1024, 4); table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; @@ -1526,7 +1693,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { // Enable the cache for index/filter blocks BlockBasedTableOptions table_options; - table_options.block_cache = NewLRUCache(1024); + table_options.block_cache = NewLRUCache(1024, 4); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; @@ -1544,7 +1711,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. - unique_ptr iter; + unique_ptr iter; int64_t last_cache_bytes_read = 0; // At first, no block will be accessed. 
@@ -1599,15 +1766,16 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); ASSERT_EQ(props.GetCacheBytesWrite(), table_options.block_cache->GetUsage()); - last_cache_bytes_read = props.GetCacheBytesRead(); } // release the iterator so that the block cache can reset correctly. iter.reset(); + c.ResetTableReader(); + // -- PART 2: Open with very small block cache // In this test, no block will ever get hit since the block cache is // too small to fit even one entry. - table_options.block_cache = NewLRUCache(1); + table_options.block_cache = NewLRUCache(1, 4); options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions2(options); @@ -1644,9 +1812,10 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { ASSERT_EQ(props.GetCacheBytesRead(), 0); } iter.reset(); + c.ResetTableReader(); // -- PART 3: Open table with bloom filter enabled but not in SST file - table_options.block_cache = NewLRUCache(4096); + table_options.block_cache = NewLRUCache(4096, 4); table_options.cache_index_and_filter_blocks = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1657,7 +1826,9 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { ImmutableCFOptions ioptions3(options); // Generate table without filter policy c3.Finish(options, ioptions3, table_options, - GetPlainInternalComparator(options.comparator), &keys, &kvmap); + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + c3.ResetTableReader(); + // Open table with filter policy table_options.filter_policy.reset(NewBloomFilterPolicy(1)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -1674,6 +1845,51 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { ASSERT_EQ(value, "hello"); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertFilterBlockStat(0, 0); + c3.ResetTableReader(); +} 
+ +void ValidateBlockSizeDeviation(int value, int expected) { + BlockBasedTableOptions table_options; + table_options.block_size_deviation = value; + BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); + + const BlockBasedTableOptions* normalized_table_options = + (const BlockBasedTableOptions*)factory->GetOptions(); + ASSERT_EQ(normalized_table_options->block_size_deviation, expected); + + delete factory; +} + +void ValidateBlockRestartInterval(int value, int expected) { + BlockBasedTableOptions table_options; + table_options.block_restart_interval = value; + BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); + + const BlockBasedTableOptions* normalized_table_options = + (const BlockBasedTableOptions*)factory->GetOptions(); + ASSERT_EQ(normalized_table_options->block_restart_interval, expected); + + delete factory; +} + +TEST_F(BlockBasedTableTest, InvalidOptions) { + // invalid values for block_size_deviation (<0 or >100) are silently set to 0 + ValidateBlockSizeDeviation(-10, 0); + ValidateBlockSizeDeviation(-1, 0); + ValidateBlockSizeDeviation(0, 0); + ValidateBlockSizeDeviation(1, 1); + ValidateBlockSizeDeviation(99, 99); + ValidateBlockSizeDeviation(100, 100); + ValidateBlockSizeDeviation(101, 0); + ValidateBlockSizeDeviation(1000, 0); + + // invalid values for block_restart_interval (<1) are silently set to 1 + ValidateBlockRestartInterval(-10, 1); + ValidateBlockRestartInterval(-1, 1); + ValidateBlockRestartInterval(0, 1); + ValidateBlockRestartInterval(1, 1); + ValidateBlockRestartInterval(2, 2); + ValidateBlockRestartInterval(1000, 1000); } TEST_F(BlockBasedTableTest, BlockReadCountTest) { @@ -1761,7 +1977,7 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) { BlockBasedTableOptions table_options; table_options.block_size = 1024; // big enough so we don't ever lose cached values. 
- table_options.block_cache = NewLRUCache(16 * 1024 * 1024); + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); @@ -1777,7 +1993,7 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) { const ImmutableCFOptions ioptions(opt); c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); - unique_ptr iter(c.NewIterator()); + unique_ptr iter(c.NewIterator()); iter->SeekToFirst(); while (iter->Valid()) { iter->key(); @@ -1792,9 +2008,10 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) { for (const std::string& key : keys) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); } + c.ResetTableReader(); // rerun with different block cache - table_options.block_cache = NewLRUCache(16 * 1024 * 1024); + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions2(opt); ASSERT_OK(c.Reopen(ioptions2)); @@ -1802,6 +2019,66 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) { for (const std::string& key : keys) { ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); } + c.ResetTableReader(); +} + +TEST_F(BlockBasedTableTest, NewIndexIteratorLeak) { + // A regression test to avoid data race described in + // https://github.com/facebook/rocksdb/issues/1267 + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Add("a1", "val1"); + Options options; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + BlockBasedTableOptions table_options; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = NewLRUCache(0); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, 
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( + { + {"BlockBasedTable::NewIndexIterator::thread1:1", + "BlockBasedTable::NewIndexIterator::thread2:2"}, + {"BlockBasedTable::NewIndexIterator::thread2:3", + "BlockBasedTable::NewIndexIterator::thread1:4"}, + }, + { + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", + "BlockBasedTable::NewIndexIterator::thread1:1"}, + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", + "BlockBasedTable::NewIndexIterator::thread1:4"}, + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", + "BlockBasedTable::NewIndexIterator::thread2:2"}, + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", + "BlockBasedTable::NewIndexIterator::thread2:3"}, + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ReadOptions ro; + auto* reader = c.GetTableReader(); + + std::function func1 = [&]() { + TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker"); + std::unique_ptr iter(reader->NewIterator(ro)); + iter->Seek(InternalKey("a1", 0, kTypeValue).Encode()); + }; + + std::function func2 = [&]() { + TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker"); + std::unique_ptr iter(reader->NewIterator(ro)); + }; + + auto thread1 = std::thread(func1); + auto thread2 = std::thread(func2); + thread1.join(); + thread2.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + c.ResetTableReader(); } // Plain table is not supported in ROCKSDB_LITE @@ -1821,9 +2098,13 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { InternalKeyComparator ikc(options.comparator); std::vector> int_tbl_prop_collector_factories; + std::string column_family_name; std::unique_ptr builder(factory.NewTableBuilder( TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, CompressionOptions(), false), + kNoCompression, CompressionOptions(), + nullptr /* compression_dict */, + 
false /* skip_filters */, column_family_name), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -1843,7 +2124,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { TableProperties* props = nullptr; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), - kPlainTableMagicNumber, Env::Default(), nullptr, + kPlainTableMagicNumber, ioptions, &props); std::unique_ptr props_guard(props); ASSERT_OK(s); @@ -1888,6 +2169,7 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); + c.ResetTableReader(); } static void DoCompressionTest(CompressionType comp) { @@ -1914,6 +2196,7 @@ static void DoCompressionTest(CompressionType comp) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); + c.ResetTableReader(); } TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) { @@ -1946,6 +2229,13 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) { compression_state.push_back(kLZ4HCCompression); } + if (!XPRESS_Supported()) { + fprintf(stderr, "skipping xpress and xpress compression tests\n"); + } + else { + compression_state.push_back(kXpressCompression); + } + for (auto state : compression_state) { DoCompressionTest(state); } @@ -1972,9 +2262,10 @@ TEST_F(HarnessTest, Randomized) { } } +#ifndef ROCKSDB_LITE TEST_F(HarnessTest, RandomizedLongDB) { Random rnd(test::RandomSeed()); - TestArgs args = { DB_TEST, false, 16, kNoCompression, 0 }; + TestArgs args = {DB_TEST, false, 16, kNoCompression, 0, false}; Init(args); int num_entries = 100000; for (int e = 0; e < num_entries; e++) { @@ -1995,6 +2286,7 @@ TEST_F(HarnessTest, RandomizedLongDB) { } 
ASSERT_GT(files, 0); } +#endif // ROCKSDB_LITE class MemTableTest : public testing::Test {}; @@ -2004,7 +2296,7 @@ TEST_F(MemTableTest, Simple) { Options options; options.memtable_factory = table_factory; ImmutableCFOptions ioptions(options); - WriteBuffer wb(options.db_write_buffer_size); + WriteBufferManager wb(options.db_write_buffer_size); MemTable* memtable = new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb, kMaxSequenceNumber); @@ -2016,7 +2308,8 @@ TEST_F(MemTableTest, Simple) { batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("largekey"), std::string("vlarge")); ColumnFamilyMemTablesDefault cf_mems_default(memtable); - ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); + ASSERT_TRUE( + WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr).ok()); Arena arena; ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena)); @@ -2177,6 +2470,150 @@ TEST_F(HarnessTest, FooterTests) { } } +class IndexBlockRestartIntervalTest + : public BlockBasedTableTest, + public ::testing::WithParamInterface { + public: + static std::vector GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; } +}; + +INSTANTIATE_TEST_CASE_P( + IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest, + ::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues())); + +TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { + const int kKeysInTable = 10000; + const int kKeySize = 100; + const int kValSize = 500; + + int index_block_restart_interval = GetParam(); + + Options options; + BlockBasedTableOptions table_options; + table_options.block_size = 64; // small block size to get big index block + table_options.index_block_restart_interval = index_block_restart_interval; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator()); + static Random rnd(301); + for (int i = 0; i < kKeysInTable; i++) { + InternalKey 
k(RandomString(&rnd, kKeySize), 0, kTypeValue); + c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + } + + std::vector keys; + stl_wrappers::KVMap kvmap; + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); + auto reader = c.GetTableReader(); + + std::unique_ptr db_iter(reader->NewIterator(ReadOptions())); + + // Test point lookup + for (auto& kv : kvmap) { + db_iter->Seek(kv.first); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), kv.first); + ASSERT_EQ(db_iter->value(), kv.second); + } + + // Test iterating + auto kv_iter = kvmap.begin(); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + ASSERT_EQ(db_iter->key(), kv_iter->first); + ASSERT_EQ(db_iter->value(), kv_iter->second); + kv_iter++; + } + ASSERT_EQ(kv_iter, kvmap.end()); + c.ResetTableReader(); +} + +class PrefixTest : public testing::Test { + public: + PrefixTest() : testing::Test() {} + ~PrefixTest() {} +}; + +namespace { +// A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest +class TestPrefixExtractor : public rocksdb::SliceTransform { + public: + ~TestPrefixExtractor() override{}; + const char* Name() const override { return "TestPrefixExtractor"; } + + rocksdb::Slice Transform(const rocksdb::Slice& src) const override { + assert(IsValid(src)); + return rocksdb::Slice(src.data(), 3); + } + + bool InDomain(const rocksdb::Slice& src) const override { + assert(IsValid(src)); + return true; + } + + bool InRange(const rocksdb::Slice& dst) const override { return true; } + + bool IsValid(const rocksdb::Slice& src) const { + if (src.size() != 4) { + return false; + } + if (src[0] != '[') { + return false; + } + if (src[1] < '0' || src[1] > '9') { + return false; + } + if (src[2] != ']') { + return false; + } + if (src[3] < '0' || src[3] > '9') { + return false; + } + 
return true; + } +}; +} // namespace + +TEST_F(PrefixTest, PrefixAndWholeKeyTest) { + rocksdb::Options options; + options.compaction_style = rocksdb::kCompactionStyleUniversal; + options.num_levels = 20; + options.create_if_missing = true; + options.optimize_filters_for_hits = false; + options.target_file_size_base = 268435456; + options.prefix_extractor = std::make_shared(); + rocksdb::BlockBasedTableOptions bbto; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10)); + bbto.block_size = 262144; + + bbto.whole_key_filtering = true; + + const std::string kDBPath = test::TmpDir() + "/table_prefix_test"; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyDB(kDBPath, options); + rocksdb::DB* db; + ASSERT_OK(rocksdb::DB::Open(options, kDBPath, &db)); + + // Create a bunch of keys with 10 filters. + for (int i = 0; i < 10; i++) { + std::string prefix = "[" + std::to_string(i) + "]"; + for (int j = 0; j < 10; j++) { + std::string key = prefix + std::to_string(j); + db->Put(rocksdb::WriteOptions(), key, "1"); + } + } + + // Trigger compaction. + db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + delete db; + // In the second round, turn whole_key_filtering off and expect + // rocksdb still works. +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/external/rocksdb/table/two_level_iterator.cc b/external/rocksdb/table/two_level_iterator.cc index f540d3b167..81dc8792ba 100644 --- a/external/rocksdb/table/two_level_iterator.cc +++ b/external/rocksdb/table/two_level_iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -9,6 +9,7 @@ #include "table/two_level_iterator.h" +#include "db/pinned_iterators_manager.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block.h" @@ -19,13 +20,17 @@ namespace rocksdb { namespace { -class TwoLevelIterator: public Iterator { +class TwoLevelIterator : public InternalIterator { public: explicit TwoLevelIterator(TwoLevelIteratorState* state, - Iterator* first_level_iter, + InternalIterator* first_level_iter, bool need_free_iter_and_state); virtual ~TwoLevelIterator() { + // Assert that the TwoLevelIterator is never deleted while Pinning is + // Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); first_level_iter_.DeleteIter(!need_free_iter_and_state_); second_level_iter_.DeleteIter(false); if (need_free_iter_and_state_) { @@ -61,6 +66,22 @@ class TwoLevelIterator: public Iterator { return status_; } } + virtual void SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + first_level_iter_.SetPinnedItersMgr(pinned_iters_mgr); + if (second_level_iter_.iter()) { + second_level_iter_.SetPinnedItersMgr(pinned_iters_mgr); + } + } + virtual bool IsKeyPinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + second_level_iter_.iter() && second_level_iter_.IsKeyPinned(); + } + virtual bool IsValuePinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + second_level_iter_.iter() && second_level_iter_.IsValuePinned(); + } private: void SaveError(const Status& s) { @@ -68,13 +89,14 @@ class TwoLevelIterator: public Iterator { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(Iterator* iter); + void SetSecondLevelIterator(InternalIterator* iter); void InitDataBlock(); TwoLevelIteratorState* state_; IteratorWrapper first_level_iter_; IteratorWrapper second_level_iter_; // May be nullptr bool 
need_free_iter_and_state_; + PinnedIteratorsManager* pinned_iters_mgr_; Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. @@ -82,11 +104,12 @@ class TwoLevelIterator: public Iterator { }; TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, - Iterator* first_level_iter, + InternalIterator* first_level_iter, bool need_free_iter_and_state) : state_(state), first_level_iter_(first_level_iter), - need_free_iter_and_state_(need_free_iter_and_state) {} + need_free_iter_and_state_(need_free_iter_and_state), + pinned_iters_mgr_(nullptr) {} void TwoLevelIterator::Seek(const Slice& target) { if (state_->check_prefix_may_match && @@ -168,11 +191,21 @@ void TwoLevelIterator::SkipEmptyDataBlocksBackward() { } } -void TwoLevelIterator::SetSecondLevelIterator(Iterator* iter) { +void TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) { if (second_level_iter_.iter() != nullptr) { SaveError(second_level_iter_.status()); } - second_level_iter_.Set(iter); + + if (pinned_iters_mgr_ && iter) { + iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + + InternalIterator* old_iter = second_level_iter_.Set(iter); + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + pinned_iters_mgr_->PinIteratorIfNeeded(old_iter); + } else { + delete old_iter; + } } void TwoLevelIterator::InitDataBlock() { @@ -186,7 +219,7 @@ void TwoLevelIterator::InitDataBlock() { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - Iterator* iter = state_->NewSecondaryIterator(handle); + InternalIterator* iter = state_->NewSecondaryIterator(handle); data_block_handle_.assign(handle.data(), handle.size()); SetSecondLevelIterator(iter); } @@ -195,9 +228,10 @@ void TwoLevelIterator::InitDataBlock() { } // namespace -Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state, - Iterator* first_level_iter, Arena* arena, 
- bool need_free_iter_and_state) { +InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state, + InternalIterator* first_level_iter, + Arena* arena, + bool need_free_iter_and_state) { if (arena == nullptr) { return new TwoLevelIterator(state, first_level_iter, need_free_iter_and_state); diff --git a/external/rocksdb/table/two_level_iterator.h b/external/rocksdb/table/two_level_iterator.h index 4c6b48c2c2..d210132cbb 100644 --- a/external/rocksdb/table/two_level_iterator.h +++ b/external/rocksdb/table/two_level_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -23,7 +23,7 @@ struct TwoLevelIteratorState { : check_prefix_may_match(_check_prefix_may_match) {} virtual ~TwoLevelIteratorState() {} - virtual Iterator* NewSecondaryIterator(const Slice& handle) = 0; + virtual InternalIterator* NewSecondaryIterator(const Slice& handle) = 0; virtual bool PrefixMayMatch(const Slice& internal_key) = 0; // If call PrefixMayMatch() @@ -45,9 +45,8 @@ struct TwoLevelIteratorState { // all the states but those allocated in arena. // need_free_iter_and_state: free `state` and `first_level_iter` if // true. Otherwise, just call destructor. 
-extern Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state, - Iterator* first_level_iter, - Arena* arena = nullptr, - bool need_free_iter_and_state = true); +extern InternalIterator* NewTwoLevelIterator( + TwoLevelIteratorState* state, InternalIterator* first_level_iter, + Arena* arena = nullptr, bool need_free_iter_and_state = true); } // namespace rocksdb diff --git a/external/rocksdb/third-party/fbson/FbsonDocument.h b/external/rocksdb/third-party/fbson/FbsonDocument.h index c70f9ecb2a..9a00e24714 100644 --- a/external/rocksdb/third-party/fbson/FbsonDocument.h +++ b/external/rocksdb/third-party/fbson/FbsonDocument.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/external/rocksdb/third-party/fbson/FbsonJsonParser.h b/external/rocksdb/third-party/fbson/FbsonJsonParser.h index 3525b68b5b..678d970fb0 100644 --- a/external/rocksdb/third-party/fbson/FbsonJsonParser.h +++ b/external/rocksdb/third-party/fbson/FbsonJsonParser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the @@ -40,7 +40,7 @@ * string to an id, and store the dictionary id in FBSON to save space. The * purpose of using an external dictionary is more towards a collection of * documents (which has common keys) rather than a single document, so that - * space saving will be siginificant. + * space saving will be significant. * * ** Endianness ** * Note: FBSON serialization doesn't assume endianness of the server. However @@ -567,7 +567,7 @@ class FbsonJsonParserT { ++precision; } else if (ch == '.') { // note we don't pop out '.' 
- return parseDouble(in, val, precision, sign); + return parseDouble(in, static_cast(val), precision, sign); } else { err_ = FbsonErrType::E_INVALID_DECIMAL; return false; @@ -578,7 +578,7 @@ class FbsonJsonParserT { // if the number overflows int64_t, first parse it as double iff we see a // decimal point later. Otherwise, will treat it as overflow if (val < 0 && val > std::numeric_limits::min()) { - return parseDouble(in, (uint64_t)val, precision, sign); + return parseDouble(in, static_cast(val), precision, sign); } ch = in.peek(); diff --git a/external/rocksdb/third-party/fbson/FbsonStream.h b/external/rocksdb/third-party/fbson/FbsonStream.h index 22851240d3..5f70221db5 100644 --- a/external/rocksdb/third-party/fbson/FbsonStream.h +++ b/external/rocksdb/third-party/fbson/FbsonStream.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/external/rocksdb/third-party/fbson/FbsonUtil.h b/external/rocksdb/third-party/fbson/FbsonUtil.h index ab965630d2..2c41547699 100644 --- a/external/rocksdb/third-party/fbson/FbsonUtil.h +++ b/external/rocksdb/third-party/fbson/FbsonUtil.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/external/rocksdb/third-party/fbson/FbsonWriter.h b/external/rocksdb/third-party/fbson/FbsonWriter.h index 21bd6f232b..4efaf817c2 100644 --- a/external/rocksdb/third-party/fbson/FbsonWriter.h +++ b/external/rocksdb/third-party/fbson/FbsonWriter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the diff --git a/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc b/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc index 92c3a439b7..154ec6f26b 100644 --- a/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc +++ b/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc @@ -34,6 +34,9 @@ // Sometimes it's desirable to build Google Test by compiling a single file. // This file serves this purpose. +// Suppress clang analyzer warnings. +#ifndef __clang_analyzer__ + // This line ensures that gtest.h can be compiled on its own, even // when it's fused. #include "gtest/gtest.h" @@ -109,7 +112,6 @@ #ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ #define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - namespace testing { // This helper class can be used to mock out Google Test failure reporting @@ -2592,7 +2594,7 @@ class Hunk { // Print a unified diff header for one hunk. // The format is // "@@ -, +, @@" - // where the left/right parts are ommitted if unnecessary. + // where the left/right parts are omitted if unnecessary. void PrintHeader(std::ostream* ss) const { *ss << "@@ "; if (removes_) { @@ -10255,3 +10257,5 @@ const char* TypedTestCasePState::VerifyRegisteredTestNames( } // namespace internal } // namespace testing + +#endif // __clang_analyzer__ diff --git a/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h b/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h index 2756b47d55..e3f0cfb95c 100644 --- a/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h +++ b/external/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h @@ -7682,7 +7682,7 @@ namespace edit_distance { // Returns the optimal edits to go from 'left' to 'right'. // All edits cost the same, with replace having lower priority than // add/remove. -// Simple implementation of the Wagner–Fischer algorithm. 
+// Simple implementation of the Wagner-Fischer algorithm. // See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm enum EditType { kMatch, kAdd, kRemove, kReplace }; GTEST_API_ std::vector CalculateOptimalEdits( @@ -17586,7 +17586,7 @@ internal::CartesianProductHolder10()); \ return 0; \ } \ - static int gtest_registering_dummy_; \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ GTEST_DISALLOW_COPY_AND_ASSIGN_(\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ }; \ diff --git a/external/rocksdb/thirdparty.inc b/external/rocksdb/thirdparty.inc index 448c49ac87..9fffd9bff0 100644 --- a/external/rocksdb/thirdparty.inc +++ b/external/rocksdb/thirdparty.inc @@ -7,7 +7,9 @@ set(USE_GFLAGS_DEFAULT 0) # GFLAGS is disabled by default, enable with -D set(USE_SNAPPY_DEFAULT 0) # SNAPPY is disabled by default, enable with -DSNAPPY=1 cmake command line agrument set(USE_LZ4_DEFAULT 0) # LZ4 is disabled by default, enable with -DLZ4=1 cmake command line agrument set(USE_ZLIB_DEFAULT 0) # ZLIB is disabled by default, enable with -DZLIB=1 cmake command line agrument +set(USE_XPRESS_DEFAULT 0) # XPRESS is disabled by default, enable with -DXPRESS=1 cmake command line agrument set(USE_JEMALLOC_DEFAULT 0) # JEMALLOC is disabled by default, enable with -DJEMALLOC=1 cmake command line agrument +set(USE_JENONINIT_DEFAULT 1) # Default is enabled do not call je_init/je_uninit as the newer versions do not have it disable with -DJENONINIT=0 # # This example assumes all the libraries locate in directories under THIRDPARTY_HOME environment variable @@ -36,6 +38,19 @@ endif () if (${USE_GFLAGS} EQUAL 1) message(STATUS "GFLAGS library is enabled") + + if(DEFINED ENV{GFLAGS_INCLUDE}) + set(GFLAGS_INCLUDE $ENV{GFLAGS_INCLUDE}) + endif() + + if(DEFINED ENV{GFLAGS_LIB_DEBUG}) + set(GFLAGS_LIB_DEBUG $ENV{GFLAGS_LIB_DEBUG}) + endif() + + if(DEFINED ENV{GFLAGS_LIB_RELEASE}) + set(GFLAGS_LIB_RELEASE $ENV{GFLAGS_LIB_RELEASE}) + endif() + set(GFLAGS_CXX_FLAGS -DGFLAGS=gflags) 
set(GFLAGS_LIBS debug ${GFLAGS_LIB_DEBUG} optimized ${GFLAGS_LIB_RELEASE}) @@ -66,6 +81,19 @@ endif () if (${USE_SNAPPY} EQUAL 1) message(STATUS "SNAPPY library is enabled") + + if(DEFINED ENV{SNAPPY_INCLUDE}) + set(SNAPPY_INCLUDE $ENV{SNAPPY_INCLUDE}) + endif() + + if(DEFINED ENV{SNAPPY_LIB_DEBUG}) + set(SNAPPY_LIB_DEBUG $ENV{SNAPPY_LIB_DEBUG}) + endif() + + if(DEFINED ENV{SNAPPY_LIB_RELEASE}) + set(SNAPPY_LIB_RELEASE $ENV{SNAPPY_LIB_RELEASE}) + endif() + set(SNAPPY_CXX_FLAGS -DSNAPPY) set(SNAPPY_LIBS debug ${SNAPPY_LIB_DEBUG} optimized ${SNAPPY_LIB_RELEASE}) @@ -96,6 +124,19 @@ endif () if (${USE_LZ4} EQUAL 1) message(STATUS "LZ4 library is enabled") + + if(DEFINED ENV{LZ4_INCLUDE}) + set(LZ4_INCLUDE $ENV{LZ4_INCLUDE}) + endif() + + if(DEFINED ENV{LZ4_LIB_DEBUG}) + set(LZ4_LIB_DEBUG $ENV{LZ4_LIB_DEBUG}) + endif() + + if(DEFINED ENV{LZ4_LIB_RELEASE}) + set(LZ4_LIB_RELEASE $ENV{LZ4_LIB_RELEASE}) + endif() + set(LZ4_CXX_FLAGS -DLZ4) set(LZ4_LIBS debug ${LZ4_LIB_DEBUG} optimized ${LZ4_LIB_RELEASE}) @@ -126,6 +167,19 @@ endif () if (${USE_ZLIB} EQUAL 1) message(STATUS "ZLIB library is enabled") + + if(DEFINED ENV{ZLIB_INCLUDE}) + set(ZLIB_INCLUDE $ENV{ZLIB_INCLUDE}) + endif() + + if(DEFINED ENV{ZLIB_LIB_DEBUG}) + set(ZLIB_LIB_DEBUG $ENV{ZLIB_LIB_DEBUG}) + endif() + + if(DEFINED ENV{ZLIB_LIB_RELEASE}) + set(ZLIB_LIB_RELEASE $ENV{ZLIB_LIB_RELEASE}) + endif() + set(ZLIB_CXX_FLAGS -DZLIB) set(ZLIB_LIBS debug ${ZLIB_LIB_DEBUG} optimized ${ZLIB_LIB_RELEASE}) @@ -136,6 +190,23 @@ else () message(STATUS "ZLIB library is disabled") endif () +if (DEFINED XPRESS) + set(USE_XPRESS ${XPRESS}) +else () + set(USE_XPRESS ${USE_XPRESS_DEFAULT}) +endif () + +if (${USE_XPRESS} EQUAL 1) + message(STATUS "XPRESS is enabled") + + add_definitions(-DXPRESS) + + # We are using the implementation provided by the system + set (SYSTEM_LIBS ${SYSTEM_LIBS} Cabinet.lib) +else () + message(STATUS "XPRESS is disabled") +endif () + # # Edit these 4 lines to define paths to Jemalloc # @@ -156,13 
+227,38 @@ endif () if (${USE_JEMALLOC} EQUAL 1) message(STATUS "JEMALLOC library is enabled") - set(JEMALLOC_CXX_FLAGS -DJEMALLOC) + set(JEMALLOC_CXX_FLAGS "-DJEMALLOC -DJEMALLOC_EXPORT= ") + + if(DEFINED ENV{JEMALLOC_INCLUDE}) + set(JEMALLOC_INCLUDE $ENV{JEMALLOC_INCLUDE}) + endif() + + if(DEFINED ENV{JEMALLOC_LIB_DEBUG}) + set(JEMALLOC_LIB_DEBUG $ENV{JEMALLOC_LIB_DEBUG}) + endif() + + if(DEFINED ENV{JEMALLOC_LIB_RELEASE}) + set(JEMALLOC_LIB_RELEASE $ENV{JEMALLOC_LIB_RELEASE}) + endif() + set(JEMALLOC_LIBS debug ${JEMALLOC_LIB_DEBUG} optimized ${JEMALLOC_LIB_RELEASE}) add_definitions(${JEMALLOC_CXX_FLAGS}) include_directories(${JEMALLOC_INCLUDE}) set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${JEMALLOC_LIBS}) set (ARTIFACT_SUFFIX "_je") + + set(USE_JENONINIT USE_JENONINIT_DEFAULT) + + if(JENONINIT) + set(USE_JENONINIT ${JENONINIT}) + endif() + + if(${USE_JENONINIT} EQUAL 1) + add_definitions(-DJEMALLOC_NON_INIT) + message(STATUS "JEMALLOC NONINIT version") + endif() + else () set (ARTIFACT_SUFFIX "") message(STATUS "JEMALLOC library is disabled") diff --git a/external/rocksdb/tools/auto_sanity_test.sh b/external/rocksdb/tools/auto_sanity_test.sh index bece681d24..07d444efb1 100644 --- a/external/rocksdb/tools/auto_sanity_test.sh +++ b/external/rocksdb/tools/auto_sanity_test.sh @@ -64,11 +64,11 @@ echo "Creating db based on the old commit --- $commit_old" ./old_db_sanity_test $dir_old create echo "=============================================================" -echo "[Backward Compability Check]" +echo "[Backward Compatibility Check]" echo "Verifying old db $dir_old using the new commit --- $commit_new" ./new_db_sanity_test $dir_old verify if [ $? -ne 0 ]; then - echo "[ERROR] Backward Compability Check fails:" + echo "[ERROR] Backward Compatibility Check fails:" echo " Verification of $dir_old using commit $commit_new failed." 
exit 2 fi @@ -78,7 +78,7 @@ echo "[Forward Compatibility Check]" echo "Verifying new db $dir_new using the old commit --- $commit_old" ./old_db_sanity_test $dir_new verify if [ $? -ne 0 ]; then - echo "[ERROR] Forward Compability Check fails:" + echo "[ERROR] Forward Compatibility Check fails:" echo " $dir_new using commit $commit_old failed." exit 2 fi diff --git a/external/rocksdb/tools/benchmark.sh b/external/rocksdb/tools/benchmark.sh index 3c862fd152..5298bbe07c 100644 --- a/external/rocksdb/tools/benchmark.sh +++ b/external/rocksdb/tools/benchmark.sh @@ -4,7 +4,15 @@ if [ $# -ne 1 ]; then echo -n "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/" echo "readrandom/readwhilewriting/readwhilemerging/updaterandom/" - echo "mergerandom/randomtransaction]" + echo "mergerandom/randomtransaction/compact]" + exit 0 +fi + +# Make it easier to run only the compaction test. Getting valid data requires +# a number of iterations and having an ability to run the test separately from +# rest of the benchmarks helps. +if [ "$COMPACTION_TEST" == "1" -a "$1" != "universal_compaction" ]; then + echo "Skipping $1 because it's not a compaction test." exit 0 fi @@ -37,15 +45,16 @@ if [ ! 
-z $DB_BENCH_NO_SYNC ]; then fi num_threads=${NUM_THREADS:-16} -# Only for *whilewriting, *whilemerging -writes_per_second=${WRITES_PER_SECOND:-$((10 * K))} +mb_written_per_sec=${MB_WRITE_PER_SEC:-0} # Only for tests that do range scans num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10} cache_size=${CACHE_SIZE:-$((1 * G))} +compression_max_dict_bytes=${COMPRESSION_MAX_DICT_BYTES:-0} +compression_type=${COMPRESSION_TYPE:-snappy} duration=${DURATION:-0} num_keys=${NUM_KEYS:-$((1 * G))} -key_size=20 +key_size=${KEY_SIZE:-20} value_size=${VALUE_SIZE:-400} block_size=${BLOCK_SIZE:-8192} @@ -61,17 +70,18 @@ const_params=" --block_size=$block_size \ --cache_size=$cache_size \ --cache_numshardbits=6 \ - --compression_type=snappy \ - --min_level_to_compress=3 \ + --compression_max_dict_bytes=$compression_max_dict_bytes \ --compression_ratio=0.5 \ + --compression_type=$compression_type \ --level_compaction_dynamic_level_bytes=true \ --bytes_per_sync=$((8 * M)) \ --cache_index_and_filter_blocks=0 \ + --pin_l0_filter_and_index_blocks_in_cache=1 \ + --benchmark_write_rate_limit=$(( 1024 * 1024 * $mb_written_per_sec )) \ \ --hard_rate_limit=3 \ --rate_limit_delay_max_milliseconds=1000000 \ --write_buffer_size=$((128 * M)) \ - --max_write_buffer_number=8 \ --target_file_size_base=$((128 * M)) \ --max_bytes_for_level_base=$((1 * G)) \ \ @@ -98,17 +108,48 @@ if [ $duration -gt 0 ]; then const_params="$const_params --duration=$duration" fi -params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=7" -params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=7 \ +params_w="$const_params \ + $l0_config \ + --max_background_compactions=16 \ + --max_write_buffer_number=8 \ + --max_background_flushes=7" + +params_bulkload="$const_params \ + --max_background_compactions=16 \ + --max_write_buffer_number=8 \ + --max_background_flushes=7 \ --level0_file_num_compaction_trigger=$((10 * M)) \ --level0_slowdown_writes_trigger=$((10 * M)) 
\ --level0_stop_writes_trigger=$((10 * M))" +# +# Tune values for level and universal compaction. +# For universal compaction, these level0_* options mean total sorted of runs in +# LSM. In level-based compaction, it means number of L0 files. +# +params_level_compact="$const_params \ + --max_background_flushes=4 \ + --max_write_buffer_number=4 \ + --level0_file_num_compaction_trigger=4 \ + --level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=20" + +params_univ_compact="$const_params \ + --max_background_flushes=4 \ + --max_write_buffer_number=4 \ + --level0_file_num_compaction_trigger=8 \ + --level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=20" + function summarize_result { test_out=$1 test_name=$2 bench_name=$3 + # Note that this function assumes that the benchmark executes long enough so + # that "Compaction Stats" is written to stdout at least once. If it won't + # happen then empty output from grep when searching for "Sum" will cause + # syntax errors. uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' ) stall_time=$( grep "^Cumulative stall" $test_out | tail -1 | awk '{ print $3 }' ) stall_pct=$( grep "^Cumulative stall" $test_out| tail -1 | awk '{ print $5 }' ) @@ -158,9 +199,116 @@ function run_bulkload { eval $cmd } +# +# Parameter description: +# +# $1 - 1 if I/O statistics should be collected. +# $2 - compaction type to use (level=0, universal=1). +# $3 - number of subcompactions. +# $4 - number of maximum background compactions. +# +function run_manual_compaction_worker { + # This runs with a vector memtable and the WAL disabled to load faster. + # It is still crash safe and the client can discover where to restart a + # load after a crash. I think this is a good way to load. + echo "Bulk loading $num_keys random keys for manual compaction." 
+ + fillrandom_output_file=$output_dir/benchmark_man_compact_fillrandom_$3.log + man_compact_output_log=$output_dir/benchmark_man_compact_$3.log + + if [ "$2" == "1" ]; then + extra_params=$params_univ_compact + else + extra_params=$params_level_compact + fi + + # Make sure that fillrandom uses the same compaction options as compact. + cmd="./db_bench --benchmarks=fillrandom \ + --use_existing_db=0 \ + --disable_auto_compactions=0 \ + --sync=0 \ + $extra_params \ + --threads=$num_threads \ + --compaction_measure_io_stats=$1 \ + --compaction_style=$2 \ + --subcompactions=$3 \ + --memtablerep=vector \ + --disable_wal=1 \ + --max_background_compactions=$4 \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $fillrandom_output_file" + + echo $cmd | tee $fillrandom_output_file + eval $cmd + + summarize_result $fillrandom_output_file man_compact_fillrandom_$3 fillrandom + + echo "Compacting with $3 subcompactions specified ..." + + # This is the part we're really interested in. Given that compact benchmark + # doesn't output regular statistics then we'll just use the time command to + # measure how long this step takes. + cmd="{ \ + time ./db_bench --benchmarks=compact \ + --use_existing_db=1 \ + --disable_auto_compactions=0 \ + --sync=0 \ + $extra_params \ + --threads=$num_threads \ + --compaction_measure_io_stats=$1 \ + --compaction_style=$2 \ + --subcompactions=$3 \ + --max_background_compactions=$4 \ + ;} + 2>&1 | tee -a $man_compact_output_log" + + echo $cmd | tee $man_compact_output_log + eval $cmd + + # Can't use summarize_result here. One way to analyze the results is to run + # "grep real" on the resulting log files. +} + +function run_univ_compaction { + # Always ask for I/O statistics to be measured. + io_stats=1 + + # Values: kCompactionStyleLevel = 0x0, kCompactionStyleUniversal = 0x1. + compaction_style=1 + + # Define a set of benchmarks. 
+ subcompactions=(1 2 4 8 16) + max_background_compactions=(16 16 8 4 2) + + i=0 + total=${#subcompactions[@]} + + # Execute a set of benchmarks to cover variety of scenarios. + while [ "$i" -lt "$total" ] + do + run_manual_compaction_worker $io_stats $compaction_style ${subcompactions[$i]} \ + ${max_background_compactions[$i]} + ((i++)) + done +} + function run_fillseq { - # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the - # client can discover where to restart a load after a crash. I think this is a good way to load. + # This runs with a vector memtable. WAL can be either disabled or enabled + # depending on the input parameter (1 for disabled, 0 for enabled). The main + # benefit behind disabling WAL is to make loading faster. It is still crash + # safe and the client can discover where to restart a load after a crash. I + # think this is a good way to load. + + # Make sure that we'll have unique names for all the files so that data won't + # be overwritten. + if [ $1 == 1 ]; then + log_file_name=$output_dir/benchmark_fillseq.wal_disabled.v${value_size}.log + test_name=fillseq.wal_disabled.v${value_size} + else + log_file_name=$output_dir/benchmark_fillseq.wal_enabled.v${value_size}.log + test_name=fillseq.wal_enabled.v${value_size} + fi + echo "Loading $num_keys keys sequentially" cmd="./db_bench --benchmarks=fillseq \ --use_existing_db=0 \ @@ -169,12 +317,14 @@ function run_fillseq { --min_level_to_compress=0 \ --threads=1 \ --memtablerep=vector \ - --disable_wal=1 \ + --disable_wal=$1 \ --seed=$( date +%s ) \ - 2>&1 | tee -a $output_dir/benchmark_fillseq.v${value_size}.log" - echo $cmd | tee $output_dir/benchmark_fillseq.v${value_size}.log + 2>&1 | tee -a $log_file_name" + echo $cmd | tee $log_file_name eval $cmd - summarize_result $output_dir/benchmark_fillseq.v${value_size}.log fillseq.v${value_size} fillseq + + # The constant "fillseq" which we pass to db_bench is the benchmark name. 
+ summarize_result $log_file_name $test_name fillseq } function run_change { @@ -231,7 +381,6 @@ function run_readwhile { --sync=$syncval \ $params_w \ --threads=$num_threads \ - --writes_per_second=$writes_per_second \ --merge_operator=\"put\" \ --seed=$( date +%s ) \ 2>&1 | tee -a $output_dir/${out_name}" @@ -251,7 +400,6 @@ function run_rangewhile { --sync=$syncval \ $params_w \ --threads=$num_threads \ - --writes_per_second=$writes_per_second \ --merge_operator=\"put\" \ --seek_nexts=$num_nexts_per_seek \ --reverse_iterator=$reverse_arg \ @@ -312,8 +460,10 @@ for job in ${jobs[@]}; do start=$(now) if [ $job = bulkload ]; then run_bulkload - elif [ $job = fillseq ]; then - run_fillseq + elif [ $job = fillseq_disable_wal ]; then + run_fillseq 1 + elif [ $job = fillseq_enable_wal ]; then + run_fillseq 0 elif [ $job = overwrite ]; then run_change overwrite elif [ $job = updaterandom ]; then @@ -342,6 +492,8 @@ for job in ${jobs[@]}; do run_rangewhile merging $job true elif [ $job = randomtransaction ]; then run_randomtransaction + elif [ $job = universal_compaction ]; then + run_univ_compaction elif [ $job = debug ]; then num_keys=1000; # debug echo "Setting num_keys to $num_keys" @@ -355,7 +507,7 @@ for job in ${jobs[@]}; do echo "Complete $job in $((end-start)) seconds" | tee -a $schedule fi - echo -e "ops/sec\tmb/sec\tSize-GB\tL0_MB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest" + echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest" tail -1 $output_dir/report.txt done diff --git a/external/rocksdb/tools/check_format_compatible.sh b/external/rocksdb/tools/check_format_compatible.sh index 65bbe0b903..e3353bfaeb 100644 --- a/external/rocksdb/tools/check_format_compatible.sh +++ b/external/rocksdb/tools/check_format_compatible.sh @@ -4,7 +4,7 @@ # ./ldb needs to be avaible to be executed. # # Usage: