Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ namespace seqan::hibf
* | Layout | seqan::hibf::config::sketch_bits | 12 | |
* | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset |
* | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout |
* | General | seqan::hibf::config::track_occupancy | false | |
* | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | |
* | Layout | seqan::hibf::config::alpha | 1.2 | |
* | Layout | seqan::hibf::config::disable_estimate_union | false | |
Expand Down Expand Up @@ -243,11 +244,32 @@ struct config
* designated to contain any data. The resulting layout will be very similar to a layout with `tmax` set to `58`
* and no empty bins.
*
* Choosing a value larger than `0.0` will also enable the `track_occupancy` option.
*
* Value must be in range [0.0,1.0).
* Recommendation: default value (0.0). This option is not recommended for general use.
*/
double empty_bin_fraction{};

/*!\brief Track the number of emplaced elements for each technical bin.
*
* An IBF can track how many elements were emplaced into each technical bin.
* This option can be useful for a dynamic index, or to compute the exact FPR for a technical bin.
*
* The occupancy of a technical bin `i` of IBF `ibf` can be accessed via `ibf.occupancy[i]`.
*
* For occupancy, emplacing an element means that a bit of the conceptual Bloom Filter representing the respective
* technical bin changes.
* For example, adding the same value multiple times to the same technical bin will not increase the occupancy.
* Likewise, if the respective bits for a value have already been set by previous emplacing operations, the
* occupancy will not increase.
*
* This option comes with a minor performance penalty for seqan::hibf::interleaved_bloom_filter::emplace.
*
* Recommendation: default value (false).
*/
bool track_occupancy{false};

/*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm.
*
* The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of
Expand Down Expand Up @@ -330,6 +352,8 @@ struct config
* * Not setting seqan::hibf::config::tmax, or setting it to `0`, results in a default tmax
* `std::ceil(std::sqrt(number_of_user_bins))` being used.
* * seqan::hibf::config::tmax is increased to the next multiple of 64.
* * Setting seqan::hibf::config::empty_bin_fraction to a value larger than `0.0` will also enable
* seqan::hibf::config::track_occupancy.
*/
void validate_and_set_defaults();

Expand All @@ -354,7 +378,7 @@ struct config
private:
friend class cereal::access;

static constexpr uint32_t version{2};
static constexpr uint32_t version{3u};

template <typename archive_t>
void serialize(archive_t & archive)
Expand All @@ -371,9 +395,16 @@ struct config
archive(CEREAL_NVP(sketch_bits));
archive(CEREAL_NVP(tmax));

if (parsed_version > 1u)
if (parsed_version >= 2u)
{
archive(CEREAL_NVP(empty_bin_fraction));

if (parsed_version >= 3u)
archive(CEREAL_NVP(track_occupancy));
else
track_occupancy = empty_bin_fraction != 0.0;
}

archive(CEREAL_NVP(alpha));
archive(CEREAL_NVP(max_rearrangement_ratio));
archive(CEREAL_NVP(disable_estimate_union));
Expand Down
2 changes: 1 addition & 1 deletion src/build/construct_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s
seqan::hibf::interleaved_bloom_filter ibf{bin_count,
bin_size,
seqan::hibf::hash_function_count{data.config.number_of_hash_functions},
data.config.empty_bin_fraction > 0.0};
data.config.track_occupancy};

local_index_allocation_timer.stop();
data.index_allocation_timer += local_index_allocation_timer;
Expand Down
3 changes: 3 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ void config::validate_and_set_defaults()
if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."};

if (empty_bin_fraction != 0.0)
track_occupancy = true;

if (alpha < 0.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."};

Expand Down
2 changes: 1 addition & 1 deletion src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins},
seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)},
seqan::hibf::hash_function_count{configuration.number_of_hash_functions},
configuration.empty_bin_fraction > 0.0}
configuration.track_occupancy}
{
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u);

Expand Down
2 changes: 2 additions & 0 deletions test/snippet/hibf/hibf_construction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ int main()
.threads = 1, // recommended to adapt
.sketch_bits = 12,
.tmax = 0, // triggers default computation
.empty_bin_fraction = 0.0,
.track_occupancy = false,
.alpha = 1.2,
.max_rearrangement_ratio = 0.5,
.disable_estimate_union = false,
Expand Down
69 changes: 66 additions & 3 deletions test/unit/hibf/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ TEST(config_test, write_to)
std::string const expected_file{"@HIBF_CONFIG\n"
"@{\n"
"@ \"hibf_config\": {\n"
"@ \"version\": 2,\n"
"@ \"version\": 3,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_fpr\": 0.0001,\n"
Expand All @@ -46,6 +46,7 @@ TEST(config_test, write_to)
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
"@ \"empty_bin_fraction\": 0.0,\n"
"@ \"track_occupancy\": false,\n"
"@ \"alpha\": 1.0,\n"
"@ \"max_rearrangement_ratio\": 0.333,\n"
"@ \"disable_estimate_union\": true,\n"
Expand All @@ -62,7 +63,7 @@ TEST(config_test, read_from)
std::stringstream ss{"@HIBF_CONFIG\n"
"@{\n"
"@ \"hibf_config\": {\n"
"@ \"version\": 2,\n"
"@ \"version\": 3,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_fpr\": 0.0001,\n"
Expand All @@ -71,6 +72,7 @@ TEST(config_test, read_from)
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
"@ \"empty_bin_fraction\": 0.5,\n"
"@ \"track_occupancy\": true,\n"
"@ \"alpha\": 1.0,\n"
"@ \"max_rearrangement_ratio\": 0.333,\n"
"@ \"disable_estimate_union\": true,\n"
Expand All @@ -90,6 +92,7 @@ TEST(config_test, read_from)
EXPECT_EQ(configuration.sketch_bits, 8);
EXPECT_EQ(configuration.tmax, 128);
EXPECT_EQ(configuration.empty_bin_fraction, 0.5);
EXPECT_EQ(configuration.track_occupancy, true);
EXPECT_EQ(configuration.alpha, 1.0);
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
EXPECT_EQ(configuration.disable_estimate_union, true);
Expand Down Expand Up @@ -134,6 +137,46 @@ TEST(config_test, read_from_v1)
EXPECT_EQ(configuration.disable_rearrangement, false);
}

// Reads a config serialised with format version 2 and checks every parsed field.
// The version 2 input below carries no "track_occupancy" key; the test expects the
// flag to come out as `true` because "empty_bin_fraction" is non-zero (0.5).
TEST(config_test, read_from_v2)
{
    std::stringstream v2_input{"@HIBF_CONFIG\n"
                               "@{\n"
                               "@ \"hibf_config\": {\n"
                               "@ \"version\": 2,\n"
                               "@ \"number_of_user_bins\": 123456789,\n"
                               "@ \"number_of_hash_functions\": 4,\n"
                               "@ \"maximum_fpr\": 0.0001,\n"
                               "@ \"relaxed_fpr\": 0.3,\n"
                               "@ \"threads\": 31,\n"
                               "@ \"sketch_bits\": 8,\n"
                               "@ \"tmax\": 128,\n"
                               "@ \"empty_bin_fraction\": 0.5,\n"
                               "@ \"alpha\": 1.0,\n"
                               "@ \"max_rearrangement_ratio\": 0.333,\n"
                               "@ \"disable_estimate_union\": true,\n"
                               "@ \"disable_rearrangement\": false\n"
                               "@ }\n"
                               "@}\n"
                               "@HIBF_CONFIG_END\n"};

    seqan::hibf::config parsed;
    parsed.read_from(v2_input);

    // General options.
    EXPECT_EQ(parsed.number_of_user_bins, 123456789);
    EXPECT_EQ(parsed.number_of_hash_functions, 4);
    EXPECT_EQ(parsed.maximum_fpr, 0.0001);
    EXPECT_EQ(parsed.relaxed_fpr, 0.3);
    EXPECT_EQ(parsed.threads, 31);

    // Layout options present in the version 2 format.
    EXPECT_EQ(parsed.sketch_bits, 8);
    EXPECT_EQ(parsed.tmax, 128);
    EXPECT_EQ(parsed.empty_bin_fraction, 0.5);

    // Not part of the version 2 input: expected to be enabled since empty_bin_fraction != 0.0.
    EXPECT_EQ(parsed.track_occupancy, true);

    // Remaining layout options.
    EXPECT_EQ(parsed.alpha, 1.0);
    EXPECT_EQ(parsed.max_rearrangement_ratio, 0.333);
    EXPECT_EQ(parsed.disable_estimate_union, true);
    EXPECT_EQ(parsed.disable_rearrangement, false);
}

TEST(config_test, read_from_with_more_meta)
{
std::stringstream ss{"@blah some chopper stuff\n"
Expand All @@ -144,14 +187,16 @@ TEST(config_test, read_from_with_more_meta)
"@HIBF_CONFIG\n"
"@{\n"
"@ \"hibf_config\": {\n"
"@ \"version\": 1,\n"
"@ \"version\": 3,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_fpr\": 0.0001,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 31,\n"
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
"@ \"empty_bin_fraction\": 0.0,\n"
"@ \"track_occupancy\": true,\n"
"@ \"alpha\": 1.0,\n"
"@ \"max_rearrangement_ratio\": 0.333,\n"
"@ \"disable_estimate_union\": true,\n"
Expand All @@ -170,6 +215,8 @@ TEST(config_test, read_from_with_more_meta)
EXPECT_EQ(configuration.threads, 31);
EXPECT_EQ(configuration.sketch_bits, 8);
EXPECT_EQ(configuration.tmax, 128);
EXPECT_EQ(configuration.empty_bin_fraction, 0.0);
EXPECT_EQ(configuration.track_occupancy, true);
EXPECT_EQ(configuration.alpha, 1.0);
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
EXPECT_EQ(configuration.disable_estimate_union, true);
Expand Down Expand Up @@ -349,6 +396,20 @@ TEST(config_test, validate_and_set_defaults)
"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0).");
}

// empty_bin_fraction != 0.0 also enables tracking occupancy
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn,
.number_of_user_bins = 1u,
.empty_bin_fraction = 0.0,
.track_occupancy = false};
configuration.validate_and_set_defaults();
EXPECT_EQ(configuration.track_occupancy, false);

configuration.empty_bin_fraction = 0.3;
configuration.validate_and_set_defaults();
EXPECT_EQ(configuration.track_occupancy, true);
}

// alpha must be positive
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1};
Expand Down Expand Up @@ -413,6 +474,8 @@ TEST(config_test, serialisation)
.threads = 31,
.sketch_bits = 8,
.tmax = 128,
.empty_bin_fraction = 0.13,
.track_occupancy = true,
.alpha = 1.0,
.max_rearrangement_ratio = 0.333,
.disable_estimate_union = true,
Expand Down