Skip to content

Commit d7646ec

Browse files
author
Andre Mueller
committed
improved DB building & documentation
- parallel building of CPU databases - automatic partitioning of CPU databases with size limit - support for letters 'U' and 'u' for RNA mapping - added script to generate Krona plots from abundance results - updated documentation - cleaned up includes - some code styling changes - changed file extension of header files to '.hpp' - binary database format remains unchanged!
1 parent d233228 commit d7646ec

99 files changed

Lines changed: 6163 additions & 5714 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,18 @@
2020
/release
2121
/seafile
2222
/analysis
23+
/experiments
2324
/results
24-
/.settings
25+
/Results
26+
/Krona
27+
/krona
2528
/dev/html
2629
/test/results
2730
/test/data
2831
/test/taxonomy
2932
/test/build_*
3033
!/test/*.gz
34+
.settings
3135
.project
3236
.cproject
3337
.vscode

Makefile

Lines changed: 63 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
# Copyright 2016-2026, André Müller (github.com/muellan),
2+
# Robin Kobus (github.com/funatiq)
3+
#
4+
# This file is part of the MetaCache taxonomic sequence classification tool.
5+
#
6+
# MetaCache is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU General Public License as published by
8+
# the Free Software Foundation, either version 3 of the License, or
9+
# (at your option) any later version.
10+
#
11+
# MetaCache is distributed in the hope that it will be useful,
12+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
# GNU General Public License for more details.
15+
#
16+
# You should have received a copy of the GNU General Public License
17+
# along with MetaCache. If not, see <http://www.gnu.org/licenses/>.
18+
119
REL_ARTIFACT = metacache
220
DBG_ARTIFACT = metacache_debug
321
PRF_ARTIFACT = metacache_prf
@@ -34,52 +52,52 @@ endif
3452

3553
#--------------------------------------------------------------------
3654
HEADERS = \
37-
src/alignment.h \
38-
src/batch_processing.h \
39-
src/bitmanip.h \
40-
src/building.h \
41-
src/candidate_generation.h \
42-
src/candidate_structs.h \
43-
src/chunk_allocator.h \
44-
src/classification.h \
45-
src/classification_statistics.h \
46-
src/cmdline_utility.h \
47-
src/config.h \
48-
src/database.h \
49-
src/database_query.h \
50-
src/dna_encoding.h \
51-
src/filesys_utility.h \
55+
src/alignment.hpp \
56+
src/batch_processing.hpp \
57+
src/bitmanip.hpp \
58+
src/building.hpp \
59+
src/candidate_generation.hpp \
60+
src/candidate_structs.hpp \
61+
src/chunk_allocator.hpp \
62+
src/classification.hpp \
63+
src/classification_statistics.hpp \
64+
src/cmdline_utility.hpp \
65+
src/config.hpp \
66+
src/database.hpp \
67+
src/database_query.hpp \
68+
src/dna_encoding.hpp \
69+
src/filesys_utility.hpp \
5270
src/gpu_hashmap.cuh \
5371
src/gpu_hashmap_operations.cuh \
5472
src/gpu_result_processing.cuh \
55-
src/hash_dna.h \
56-
src/hash_int.h \
57-
src/hash_multimap.h \
58-
src/host_hashmap.h \
59-
src/io_error.h \
60-
src/io_options.h \
61-
src/io_serialize.h \
62-
src/matches_per_target.h \
63-
src/modes.h \
64-
src/options.h \
65-
src/printing.h \
73+
src/hash_dna.hpp \
74+
src/hash_int.hpp \
75+
src/hash_multimap.hpp \
76+
src/host_hashmap.hpp \
77+
src/io_error.hpp \
78+
src/io_options.hpp \
79+
src/io_serialize.hpp \
80+
src/matches_per_target.hpp \
81+
src/modes.hpp \
82+
src/options.hpp \
83+
src/printing.hpp \
6684
src/query_batch.cuh \
67-
src/query_handler.h \
68-
src/querying.h \
85+
src/query_handler.hpp \
86+
src/querying.hpp \
6987
src/sequence_batch.cuh \
70-
src/sequence_io.h \
71-
src/sequence_iostream.h \
72-
src/sequence_view.h \
73-
src/span.h \
88+
src/sequence_io.hpp \
89+
src/sequence_iostream.hpp \
90+
src/sequence_view.hpp \
91+
src/span.hpp \
7492
src/stat_combined.cuh \
75-
src/stat_combined.h \
76-
src/stat_confusion.h \
77-
src/stat_moments.h \
78-
src/string_utils.h \
79-
src/taxonomy.h \
80-
src/taxonomy_io.h \
81-
src/timer.h \
82-
src/version.h
93+
src/stat_combined.hpp \
94+
src/stat_confusion.hpp \
95+
src/stat_moments.hpp \
96+
src/string_utils.hpp \
97+
src/taxonomy.hpp \
98+
src/taxonomy_io.hpp \
99+
src/timer.hpp \
100+
src/version.hpp
83101

84102
SOURCES = \
85103
src/building.cpp \
@@ -208,7 +226,7 @@ $(ARTIFACT): $(OBJS)
208226
$(CUDA_ARTIFACT): $(OBJS) $(CUDA_OBJS)
209227
$(CUDA_COMPILER) -o $(CUDA_ARTIFACT) $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS)
210228

211-
$(DIR)/main.o : src/main.cpp src/modes.h
229+
$(DIR)/main.o : src/main.cpp src/modes.hpp
212230
$(COMPILE)
213231

214232
$(DIR)/building.o : src/building.cpp $(HEADERS)
@@ -247,16 +265,16 @@ $(DIR)/database.o : src/database.cpp $(HEADERS)
247265
$(DIR)/options.o : src/options.cpp $(HEADERS)
248266
$(COMPILE)
249267

250-
$(DIR)/mode_help.o : src/mode_help.cpp src/modes.h src/filesys_utility.h
268+
$(DIR)/mode_help.o : src/mode_help.cpp src/modes.hpp src/filesys_utility.hpp
251269
$(COMPILE)
252270

253-
$(DIR)/sequence_io.o : src/sequence_io.cpp src/sequence_io.h src/io_error.h src/sequence_iostream.h
271+
$(DIR)/sequence_io.o : src/sequence_io.cpp src/sequence_io.hpp src/io_error.hpp src/sequence_iostream.hpp
254272
$(COMPILE)
255273

256-
$(DIR)/filesys_utility.o : src/filesys_utility.cpp src/filesys_utility.h
274+
$(DIR)/filesys_utility.o : src/filesys_utility.cpp src/filesys_utility.hpp
257275
$(COMPILE)
258276

259-
$(DIR)/cmdline_utility.o : src/cmdline_utility.cpp src/cmdline_utility.h
277+
$(DIR)/cmdline_utility.o : src/cmdline_utility.cpp src/cmdline_utility.hpp
260278
$(COMPILE)
261279

262280
$(DIR)/gpu_hashmap.o : src/gpu_hashmap.cu $(HEADERS)

README.md

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
# MetaCache
22

3-
[![Linux build status](https://travis-ci.org/muellan/metacache.svg?branch=master)](https://travis-ci.org/muellan/metacache)
4-
53
MetaCache is a classification system for mapping genomic sequences (short reads, long reads, contigs, ...) from metagenomic samples to their most likely taxon of origin. MetaCache aims to reduce the memory requirement usually associated with k-mer based methods while retaining their speed. It uses locality sensitive hashing to quickly identify candidate regions within one or multiple reference genomes. A read is then classified based on the similarity to those regions.
64

7-
For an independend comparison to other tools in terms of classification accuracy see the [LEMMI](https://lemmi.ezlab.org) benchmarking site.
8-
9-
**MetaCache's CPU version** classifies around 60 Million reads (of length 100) per minute against all complete bacterial, viral and archaea genomes from NCBI RefSeq Release 97 running with 88 threads on a workstation with 2 Intel(R) Xeon(R) Gold 6238 CPUs.
5+
**MetaCache's CPU version** classifies around 110 million reads (of length 130bp) per minute against all complete bacterial, viral and archaea genomes from NCBI RefSeq Release 222 running with 128 threads on a workstation with an AMD Epyc 7713P 64-core CPU or 20 million long reads (200bp-19000bp, median 480bp) per minute on the same platform with the same database.
106

117
**MetaCache's [GPU version](docs/gpu_version.md)** classifies around 300 Million reads (of length 100) per minute against all complete bacterial, viral, fungal and archaea genomes from NCBI RefSeq Release 202 running on a workstation with 4 NVIDIA(R) Tesla(R) V100 GPUs (32 GB model).
128
[**MetaCache-GPU**](https://arxiv.org/abs/2106.08150) was presented at ICPP '21.
9+
Database build times are up to 100 times faster on the GPU and are typically on the order of a few seconds to a
10+
minute even for 100GB+ databases!
11+
MetaCache GPU has been successfully used to build and query (partitioned) databases of 1000s of eukaryotic genomes with a total size of multiple terabytes.
1312

13+
**[All-Food-Seq](docs/afs.md)** shows how MetaCache can be used for shotgun sequencing based analysis of foodstuff
14+
with large reference databases comprised of various eukaryotic and microbial genomes.
1415

1516

1617

@@ -31,7 +32,7 @@ This will
3132
* download the complete bacterial, viral and archaea genomes from the latest NCBI RefSeq release (this can take some time)
3233
* build a classification database
3334

34-
Once the default database is built you can classify reads:
35+
Once the refseq database is built you can classify reads:
3536
```
3637
./metacache query refseq myReads.fa -out results.txt
3738
./metacache query refseq anyFolderWithFastaOrFastqFiles -out results.txt
@@ -60,7 +61,7 @@ MetaCache 2.0.0 was successfully tested on the following platforms (all 64 bit +
6061
- Ubuntu 20.04 with g++ 5.4, g++ 7.4
6162
- Windows 10 20H2 running Ubuntu 20.04 inside WSL2 and g++ 10.3
6263

63-
In order to be able to build the default database (based on NCBI RefSeq Release 97) with default settings your system should have around 64GB of RAM (note that the NCBI RefSeq will still be growing in the near future).
64+
In order to be able to build a database based on NCBI RefSeq with default settings your system should have at least 128GB of RAM as of RefSeq Release 220 (note that the NCBI RefSeq will still be growing in the near future).
6465
If you don't have enough RAM, you can use [database partitioning](docs/partitioning.md).
6566

6667

@@ -152,12 +153,13 @@ If you *don't* have the zlib compression library installed and/or want *don't* w
152153
## Building Databases
153154

154155

155-
#### Building the Default RefSeq Database
156+
#### Building a RefSeq-based Database
156157

157158
Use the `metacache-build-refseq` script to build a MetaCache database based on complete bacterial, viral and archaea genomes from the latest NCBI RefSeq release. Note that the genomes will be downloaded first, which can take some time.
158159
The database files are put into the folder `genomes` in the current working directory.
159160

160161
### [Building Custom Databases...](docs/building.md)
162+
### [Building Partitioned Databases...](docs/partitioning.md)
161163

162164

163165

@@ -184,6 +186,9 @@ Once a database (e.g. the standard 'refseq'), is built you can classify reads.
184186

185187
### [Classification Output Interpretation, Analysis & Formatting Options...](docs/output.md)
186188

189+
### [Generate Krona Plots From Abundance Results](docs/krona_plots.md)
190+
191+
187192

188193

189194

@@ -211,6 +216,7 @@ or jump directly to a mode's man page with:
211216

212217

213218

219+
214220
MetaCache Copyright (C) 2016-2025 [André Müller](https://github.com/muellan) & [Robin Kobus](https://github.com/Funatiq)
215221
This program comes with ABSOLUTELY NO WARRANTY.
216222
This is free software, and you are welcome to redistribute it under certain conditions. See the file 'LICENSE' for details.

dep/clipp.h renamed to dep/clipp.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,9 +1406,7 @@ class action_provider
14061406
//---------------------------------------------------------------
14071407
/** @brief executes all argument actions */
14081408
void execute_actions(const arg_string& arg) const {
1409-
int i = 0;
14101409
for(const auto& a : argActions_) {
1411-
++i;
14121410
a(arg.c_str());
14131411
}
14141412
}

dev/coding_guidelines.md

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,16 @@ Project
77
- DON'T introduce breaking changes to the (default) output format
88
- try to avoid breaking database format changes
99
- define all entities within namespace ```mc```
10+
- for modes 'query', 'merge', etc. that produce classification output
11+
- write classification results either to dedicated file or stdout
12+
- write all other status information, error messages, etc. to stderr
1013

1114

1215
C++ Language Usage
1316
------------------
14-
- C++14
17+
- C++14 only for now
1518

16-
- NO C++17 features (yet)
17-
18-
- NO **owning** raw pointers
19+
- NO **owning** raw pointers outside of data structures
1920
=> Every resource must be cleaned up if its owner is destroyed.
2021

2122
- NO explicit ```new``` and ```delete```
@@ -26,32 +27,36 @@ C++ Language Usage
2627
- DON'T use ```#pragma once```, use include guards
2728
- DON'T use ```typedef A B;```, use ```using B = A;```
2829

29-
- DON'T return meaningless pairs or tuples from functions
30+
- DON'T return meaningless pairs or tuples from functions,
31+
use sensibly named structs with meaningfully named members instead
3032
- DON'T use **unscoped** enums, use **scoped** enums: ```enum class { ... }```
3133
- avoid functions with more than 5 parameters
3234
- avoid out-parameters (non-const reference function parameters)
3335

34-
- std::vector should be your default container choice
36+
- std::vector should be the default container choice
3537
- prefer std::unordered_map/set over std::map/set
3638
- prefer range-based for loops
3739
- prefer std:: algorithms
38-
- prefer free standing functions (over member functions)
40+
- prefer free standing functions over member functions
3941
- prefer templates over std::function
42+
- prefer concrete types/functions over generic ones
4043

4144

4245
Coding Style
4346
------------------
4447
- indentation: 4 SPACES, NO Tabs
4548
- try to keep line lengths under 80-100 characters
46-
- file extensions: headers: ".h", TUs: ".cpp"
49+
- file extensions: headers: ".hpp", TUs: ".cpp"
50+
- if the body of a loop, an if statement etc. is on a separate line
51+
from the loop head, if condition, etc. then put the body into curly braces
4752
- naming:
4853
- localVariables
4954
- memberVariables_
5055
- function_names (should be verbs)
5156
- class_names (should be nouns)
5257
- TemplateParameters
5358
- DON'T use "_" at the beginning of names;
54-
these are reserved for std:: library entities.
59+
these are reserved for std:: library entities or compiler intrinsics.
5560

5661
```cpp
5762
#include <vector>

docs/afs.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
# AFS-MetaCache: Food Ingredient Detection & Abundance Analysis
1+
# AFS-MetaCache2: Food Ingredient Detection & Abundance Analysis
22

33
MetaCache is a classification system for mapping (short or long) reads from metagenomic samples to their most likely taxon of origin. It uses locality sensitive hashing to quickly identify candidate regions within one or multiple reference genomes. A read is then classified based on the similarity to those regions.
44

5-
6-
* [MetaCache Github Repository](https://github.com/muellan/metacache)
7-
* [**MetaCacheSpark**: Apache Spark&trade; implementation of MetaCache for big data clusters](https://github.com/jmabuin/MetaCacheSpark)
5+
[MetaCache Github Repository](https://github.com/muellan/metacache)
86

97

108

@@ -40,10 +38,12 @@ metacache build afs genomes_folder \
4038

4139
It is important that you supply the option `-remove-overpopulated-features` if you add large eukaryotic genomes. This will improve classification accuracy and runtime performance.
4240

41+
In case your workstation memory is not enough to fit the entire database, MetaCache allows partitioning into several smaller databases. These can be queried independently and the results can be merged to obtain final classifications.
42+
4343

4444
#### For more information see
45-
* [Building custom databases](building.md)
4645
* [Using partitioned databases](partitioning.md)
46+
* [Building custom databases](building.md)
4747

4848

4949

@@ -68,6 +68,9 @@ If the option `-split-out` is given, mapping and abundance results will be writt
6868
for optimal results. This tells MetaCache to consider the 4 best matching candidates per read (the default is 2, which is fine for bacteria). It also makes sure that the best matching candidate wins over the lowest common ancestor of all candidates if the input read has at least 80% features more in common with this best candidate than it has with the 2nd best.
6969

7070

71+
##### [Generate Krona Plots From Abundance Results](krona_plots.md)
72+
73+
7174

7275
## More Documentation
7376

docs/building.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ You need
1919

2020

2121
If your machine doesn't have enough RAM to fit an entire database you can use [database partitioning](partitioning.md) to split up the reference genomes into several partitions.
22+
As of version 2.6.0, a faster parallel database construction algorithm is used; however, this can sometimes lead to slightly higher memory consumption. To switch back to the old construction scheme, you can supply the command line option `-threads 1`.
2223

2324

2425
## Taxonomic Hierarchy

0 commit comments

Comments
 (0)