Raptor 3.0.1
A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences
 
Loading...
Searching...
No Matches
build_data.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <atomic>
16#include <seqan3/std/new>
17
20
21namespace raptor::hibf
22{
23
24template <seqan3::data_layout data_layout_mode>
26{
27 alignas(std::hardware_destructive_interference_size) std::atomic<size_t> ibf_number{};
28 alignas(std::hardware_destructive_interference_size) std::atomic<size_t> user_bin_number{};
29
30 size_t number_of_user_bins{};
31 size_t number_of_ibfs{};
32
33 lemon::ListDigraph ibf_graph{};
34 lemon::ListDigraph::NodeMap<node_data> node_map{ibf_graph};
35
37 std::vector<double> fp_correction{};
38
39 size_t request_ibf_idx()
40 {
41 return std::atomic_fetch_add(&ibf_number, 1u);
42 }
43
44 size_t request_user_bin_idx()
45 {
46 return std::atomic_fetch_add(&user_bin_number, 1u);
47 }
48
49 void resize()
50 {
51 hibf.ibf_vector.resize(number_of_ibfs);
52 hibf.user_bins.set_ibf_count(number_of_ibfs);
53 hibf.user_bins.set_user_bin_count(number_of_user_bins);
54 hibf.next_ibf_id.resize(number_of_ibfs);
55 }
56
60 void compute_fp_correction(size_t const requested_max_tb, size_t const num_hash_functions, double const desired_fpr)
61 {
62 fp_correction.resize(requested_max_tb + 1, 0.0);
63 fp_correction[1] = 1.0;
64
65 // std::log1p(arg) = std::log(1 + arg). More precise than std::log(1 + arg) if arg is close to zero.
66 double const numerator = std::log1p(-std::exp(std::log(desired_fpr) / num_hash_functions));
67
68 for (size_t split = 2u; split <= requested_max_tb; ++split)
69 {
70 double const log_target_fpr = std::log1p(-std::exp(std::log1p(-desired_fpr) / split));
71 fp_correction[split] = numerator / std::log1p(-std::exp(log_target_fpr / num_hash_functions));
72 assert(fp_correction[split] >= 1.0);
73 }
74 }
75};
76
77} // namespace raptor::hibf
T atomic_fetch_add(T... args)
The HIBF binning directory. A data structure that efficiently answers set-membership queries for mult...
Definition hierarchical_interleaved_bloom_filter.hpp:88
std::vector< ibf_t > ibf_vector
The individual interleaved Bloom filters.
Definition hierarchical_interleaved_bloom_filter.hpp:123
std::vector< std::vector< int64_t > > next_ibf_id
Stores for each bin in each IBF of the HIBF the ID of the next IBF.
Definition hierarchical_interleaved_bloom_filter.hpp:132
user_bins user_bins
The underlying user bins.
Definition hierarchical_interleaved_bloom_filter.hpp:135
T exp(T... args)
Provides raptor::hierarchical_interleaved_bloom_filter.
T log1p(T... args)
T log(T... args)
Must be first include.
Definition bin_size_in_bits.hpp:18
Provides raptor::hibf::node_data.
T resize(T... args)
Definition build_data.hpp:26
void compute_fp_correction(size_t const requested_max_tb, size_t const num_hash_functions, double const desired_fpr)
Precompute f_h factors that adjust the split bin size to prevent FPR inflation due to multiple testin...
Definition build_data.hpp:60