muellan
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 1 deletion b/‎.gitignore‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 63 additions & 45 deletions b/‎Makefile‎
Lines changed: 63 additions & 45 deletions
diff --git a/‎README.md‎
Lines changed: 14 additions & 8 deletions b/‎README.md‎
Lines changed: 14 additions & 8 deletions
diff --git a/‎dep/clipp.h‎ ‎dep/clipp.hpp‎dep/clipp.h renamed to dep/clipp.hpp
Lines changed: 0 additions & 2 deletions b/‎dep/clipp.h‎ ‎dep/clipp.hpp‎dep/clipp.h renamed to dep/clipp.hpp
Lines changed: 0 additions & 2 deletions
diff --git a/‎dev/coding_guidelines.md‎
Lines changed: 14 additions & 9 deletions b/‎dev/coding_guidelines.md‎
Lines changed: 14 additions & 9 deletions
diff --git a/‎docs/afs.md‎
Lines changed: 8 additions & 5 deletions b/‎docs/afs.md‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎docs/building.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/building.md‎
Lines changed: 1 addition & 0 deletions
@@ -20,14 +20,18 @@
 /release
 /seafile
 /analysis
+/experiments
 /results
-/.settings
+/Results
+/Krona
+/krona
 /dev/html
 /test/results
 /test/data
 /test/taxonomy
 /test/build_*
 !/test/*.gz
+.settings
 .project
 .cproject
 .vscode
 
@@ -1,3 +1,21 @@
+# Copyright 2016-2026, André Müller (github.com/muellan),
+#                      Robin Kobus  (github.com/funatiq)
+#
+# This file is part of the MetaCache taxonomic sequence classification tool.
+#
+# MetaCache is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# MetaCache is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with MetaCache.  If not, see <http://www.gnu.org/licenses/>.
+
 REL_ARTIFACT      = metacache
 DBG_ARTIFACT      = metacache_debug
 PRF_ARTIFACT      = metacache_prf
@@ -34,52 +52,52 @@ endif
 
 #--------------------------------------------------------------------
 HEADERS = \
-          src/alignment.h \
-          src/batch_processing.h \
-          src/bitmanip.h \
-          src/building.h \
-          src/candidate_generation.h \
-          src/candidate_structs.h \
-          src/chunk_allocator.h \
-          src/classification.h \
-          src/classification_statistics.h \
-          src/cmdline_utility.h \
-          src/config.h \
-          src/database.h \
-          src/database_query.h \
-          src/dna_encoding.h \
-          src/filesys_utility.h \
+          src/alignment.hpp \
+          src/batch_processing.hpp \
+          src/bitmanip.hpp \
+          src/building.hpp \
+          src/candidate_generation.hpp \
+          src/candidate_structs.hpp \
+          src/chunk_allocator.hpp \
+          src/classification.hpp \
+          src/classification_statistics.hpp \
+          src/cmdline_utility.hpp \
+          src/config.hpp \
+          src/database.hpp \
+          src/database_query.hpp \
+          src/dna_encoding.hpp \
+          src/filesys_utility.hpp \
           src/gpu_hashmap.cuh \
           src/gpu_hashmap_operations.cuh \
           src/gpu_result_processing.cuh \
-          src/hash_dna.h \
-          src/hash_int.h \
-          src/hash_multimap.h \
-          src/host_hashmap.h \
-          src/io_error.h \
-          src/io_options.h \
-          src/io_serialize.h \
-          src/matches_per_target.h \
-          src/modes.h \
-          src/options.h \
-          src/printing.h \
+          src/hash_dna.hpp \
+          src/hash_int.hpp \
+          src/hash_multimap.hpp \
+          src/host_hashmap.hpp \
+          src/io_error.hpp \
+          src/io_options.hpp \
+          src/io_serialize.hpp \
+          src/matches_per_target.hpp \
+          src/modes.hpp \
+          src/options.hpp \
+          src/printing.hpp \
           src/query_batch.cuh \
-          src/query_handler.h \
-          src/querying.h \
+          src/query_handler.hpp \
+          src/querying.hpp \
           src/sequence_batch.cuh \
-          src/sequence_io.h \
-          src/sequence_iostream.h \
-          src/sequence_view.h \
-          src/span.h \
+          src/sequence_io.hpp \
+          src/sequence_iostream.hpp \
+          src/sequence_view.hpp \
+          src/span.hpp \
           src/stat_combined.cuh \
-          src/stat_combined.h \
-          src/stat_confusion.h \
-          src/stat_moments.h \
-          src/string_utils.h \
-          src/taxonomy.h \
-          src/taxonomy_io.h \
-          src/timer.h \
-          src/version.h
+          src/stat_combined.hpp \
+          src/stat_confusion.hpp \
+          src/stat_moments.hpp \
+          src/string_utils.hpp \
+          src/taxonomy.hpp \
+          src/taxonomy_io.hpp \
+          src/timer.hpp \
+          src/version.hpp
 
 SOURCES = \
           src/building.cpp \
@@ -208,7 +226,7 @@ $(ARTIFACT): $(OBJS)
 $(CUDA_ARTIFACT): $(OBJS) $(CUDA_OBJS)
 	$(CUDA_COMPILER) -o $(CUDA_ARTIFACT) $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS)
 
-$(DIR)/main.o : src/main.cpp src/modes.h
+$(DIR)/main.o : src/main.cpp src/modes.hpp
 	$(COMPILE)
 
 $(DIR)/building.o : src/building.cpp $(HEADERS)
@@ -247,16 +265,16 @@ $(DIR)/database.o : src/database.cpp $(HEADERS)
 $(DIR)/options.o : src/options.cpp $(HEADERS)
 	$(COMPILE)
 
-$(DIR)/mode_help.o : src/mode_help.cpp src/modes.h src/filesys_utility.h
+$(DIR)/mode_help.o : src/mode_help.cpp src/modes.hpp src/filesys_utility.hpp
 	$(COMPILE)
 
-$(DIR)/sequence_io.o : src/sequence_io.cpp src/sequence_io.h src/io_error.h src/sequence_iostream.h
+$(DIR)/sequence_io.o : src/sequence_io.cpp src/sequence_io.hpp src/io_error.hpp src/sequence_iostream.hpp
 	$(COMPILE)
 
-$(DIR)/filesys_utility.o : src/filesys_utility.cpp src/filesys_utility.h
+$(DIR)/filesys_utility.o : src/filesys_utility.cpp src/filesys_utility.hpp
 	$(COMPILE)
 
-$(DIR)/cmdline_utility.o : src/cmdline_utility.cpp src/cmdline_utility.h
+$(DIR)/cmdline_utility.o : src/cmdline_utility.cpp src/cmdline_utility.hpp
 	$(COMPILE)
 
 $(DIR)/gpu_hashmap.o : src/gpu_hashmap.cu $(HEADERS)
 
@@ -1,16 +1,17 @@
 # MetaCache
 
-[![Linux build status](https://travis-ci.org/muellan/metacache.svg?branch=master)](https://travis-ci.org/muellan/metacache)
-
 MetaCache is a classification system for mapping genomic sequences (short reads, long reads, contigs, ...) from metagenomic samples to their most likely taxon of origin. MetaCache aims to reduce the memory requirement usually associated with k-mer based methods while retaining their speed. It uses locality sensitive hashing to quickly identify candidate regions within one or multiple reference genomes. A read is then classified based on the similarity to those regions.
 
-For an independend comparison to other tools in terms of classification accuracy see the [LEMMI](https://lemmi.ezlab.org) benchmarking site.
-
-**MetaCache's CPU version** classifies around 60 Million reads (of length 100) per minute against all complete bacterial, viral and archaea genomes from NCBI RefSeq Release 97 running with 88 threads on a workstation with 2 Intel(R) Xeon(R) Gold 6238 CPUs.
+**MetaCache's CPU version** classifies around 110 million reads (of length 130bp) per minute against all complete bacterial, viral and archaea genomes from NCBI RefSeq Release 222 running with 128 threads on a workstation with an AMD Epyc 7713P 64-core CPU, or 20 million long reads (200bp-19000bp, median 480bp) per minute on the same platform with the same database.
 
 **MetaCache's [GPU version](docs/gpu_version.md)** classifies around 300 Million reads (of length 100) per minute against all complete bacterial, viral, fungal and archaea genomes from NCBI RefSeq Release 202 running on a workstation with 4 NVIDIA(R) Tesla(R) V100 GPUs (32 GB model).
 [**MetaCache-GPU**](https://arxiv.org/abs/2106.08150) was presented at ICPP '21.
+Database builds are up to 100 times faster on the GPU and typically take between a few seconds and a
+minute, even for 100GB+ databases!
+MetaCache-GPU has been successfully used to build and query (partitioned) databases of thousands of eukaryotic genomes with a total size of multiple terabytes.
 
+**[All-Food-Seq](docs/afs.md)** shows how MetaCache can be used for shotgun-sequencing-based analysis of foodstuff
+with large reference databases comprising various eukaryotic and microbial genomes.
 
 
 
@@ -31,7 +32,7 @@ This will
   * download the complete bacterial, viral and archaea genomes from the latest NCBI RefSeq release (this can take some time)
   * build a classification database
 
-Once the default database is built you can classify reads:
+Once the refseq database is built you can classify reads:
   ```
   ./metacache query refseq myReads.fa -out results.txt
   ./metacache query refseq anyFolderWithFastaOrFastqFiles -out results.txt
@@ -60,7 +61,7 @@ MetaCache 2.0.0 was successfully tested on the following platforms (all 64 bit +
 - Ubuntu 20.04 with g++ 5.4, g++ 7.4
 - Windows 10 20H2 running Ubuntu 20.04 inside WSL2 and g++ 10.3
 
-In order to be able to build the default database (based on NCBI RefSeq Release 97) with default settings your system should have around 64GB of RAM (note that the NCBI RefSeq will still be growing in the near future).
+In order to be able to build a database based on NCBI RefSeq with default settings your system should have at least 128GB of RAM as of RefSeq Release 220 (note that the NCBI RefSeq will still be growing in the near future).
 If you don't have enough RAM, you can use [database partitioning](docs/partitioning.md).
 
 
@@ -152,12 +153,13 @@ If you *don't* have the zlib compression library installed and/or want *don't* w
 ## Building Databases
 
 
-#### Building the Default RefSeq Database
+#### Building a RefSeq-based Database
 
 Use the `metacache-build-refseq` script to build a MetaCache database based on complete bacterial, viral and archaea genomes from the latest NCBI RefSeq release. Note that the genomes will be downloaded first, which can take some time.
 The database files are put into the folder `genomes` in the current working directory.
 
 ### [Building Custom Databases...](docs/building.md)
+### [Building Partitioned Databases...](docs/partitioning.md)
 
 
 
@@ -184,6 +186,9 @@ Once a database (e.g. the standard 'refseq'), is built you can classify reads.
 
 ### [Classification Output Interpretation, Analysis & Formatting Options...](docs/output.md)
 
+### [Generate Krona Plots From Abundance Results](docs/krona_plots.md)
+
+
 
 
 
@@ -211,6 +216,7 @@ or jump directly to a mode's man page with:
 
 
 
+
 MetaCache Copyright (C) 2016-2025 [André Müller](https://github.com/muellan) & [Robin Kobus](https://github.com/Funatiq)
 This program comes with ABSOLUTELY NO WARRANTY.
 This is free software, and you are welcome to redistribute it under certain conditions. See the file 'LICENSE' for details.
 
@@ -1406,9 +1406,7 @@ class action_provider
     //---------------------------------------------------------------
     /** @brief executes all argument actions */
     void execute_actions(const arg_string& arg) const {
-        int i = 0;
         for(const auto& a : argActions_) {
-            ++i;
             a(arg.c_str());
         }
     }
 
@@ -7,15 +7,16 @@ Project
  - DON'T introduce breaking changes to the (default) output format
  - try to avoid breaking database format changes
  - define all entities within namespace ```mc``` 
+ - for modes 'query', 'merge', etc. that produce classification output
+    - write classification results either to dedicated file or stdout
+    - write all other status information, error messages, etc. to stderr
 
 
 C++ Language Usage
 ------------------
- - C++14
+ - C++14 only for now
 
- - NO C++17 features (yet)
-
- - NO **owning** raw pointers
+ - NO **owning** raw pointers outside of data structures
   => Every resource must be cleaned up if its owner is destroyed.
 
  - NO explicit ```new``` and ```delete``` 
@@ -26,32 +27,36 @@ C++ Language Usage
  - DON'T use ```#pragma once```, use include guards
  - DON'T use ```typedef A B;```, use ```using B = A;```
 
- - DON'T return meaningless pairs or tuples from functions
+ - DON'T return meaningless pairs or tuples from functions,
+   use sensibly named structs with meaningfully named members instead
  - DON'T use **unscoped** enums, use **scoped** enums: ```enum class { ... }```
  - avoid functions with more than 5 parameters
  - avoid out-parameters (non-const reference function parameters)
 
- - std::vector should be your default container choice
+ - std::vector should be the default container choice
  - prefer std::unordered_map/set over std::map/set
  - prefer range-based for loops
  - prefer std:: algorithms 
- - prefer free standing functions (over member functions)
+ - prefer free standing functions over member functions
  - prefer templates over std::function
+ - prefer concrete types/functions over generic ones
 
 
 Coding Style
 ------------------
  - indentation: 4 SPACES, NO Tabs
  - try to keep line lengths under 80-100 characters
- - file extensions: headers: ".h", TUs: ".cpp"
+ - file extensions: headers: ".hpp", TUs: ".cpp"
+ - if the body of a loop, an if statement etc. is on a separate line
+   from the loop head, if condition, etc. then put the body into curly braces
  - naming:
     - localVariables
     - memberVariables_
     - function_names        (should be verbs)
     - class_names           (should be nouns)
     - TemplateParameters    
    - DON'T use "_" at the beginning of names;
-      these are reserved for std:: library entities.
+      these are reserved for std:: library entities or compiler intrinsics.
 
 ```cpp
 #include <vector>
 
@@ -1,10 +1,8 @@
-# AFS-MetaCache: Food Ingredient Detection & Abundance Analysis
+# AFS-MetaCache2: Food Ingredient Detection & Abundance Analysis
 
 MetaCache is a classification system for mapping (short or long) reads from metagenomic samples to their most likely taxon of origin. It uses locality sensitive hashing to quickly identify candidate regions within one or multiple reference genomes. A read is then classified based on the similarity to those regions. 
 
-
-* [MetaCache Github Repository](https://github.com/muellan/metacache)
-* [**MetaCacheSpark**: Apache Spark&trade; implementation of MetaCache for big data clusters](https://github.com/jmabuin/MetaCacheSpark)
+[MetaCache Github Repository](https://github.com/muellan/metacache)
 
 
 
@@ -40,10 +38,12 @@ metacache build afs genomes_folder \
 
 It is important that you supply the option `-remove-overpopulated-features` if you add large eukaryotic genomes. This will improve classification accuracy and runtime performance.
 
+In case your workstation memory is not enough to fit the entire database, MetaCache allows partitioning into several smaller databases. These can be queried independently and the results can be merged to obtain final classifications.
+
 
 #### For more information see
-* [Building custom databases](building.md)
 * [Using partitioned databases](partitioning.md)
+* [Building custom databases](building.md)
 
 
 
@@ -68,6 +68,9 @@ If the option `-split-out` is given, mapping and abundance results will be writt
 for optimal results. This tells MetaCache to consider the 4 best matching candidates per read (the default is 2, which is fine for bacteria). It also makes sure that the best matching candidate wins over the lowest common ancestor of all candidates if the input read has at least 80% features more in common with this best candidate than it has with the 2nd best.
 
 
+##### [Generate Krona Plots From Abundance Results](krona_plots.md)
+
+
 
 ## More Documentation
 
 
@@ -19,6 +19,7 @@ You need
 
 
 If your machine doesn't have enough RAM to fit an entire database you can use [database partitioning](partitioning.md) to split up the reference genomes into several partitions.
+As of version 2.6.0, a faster parallel database construction algorithm is used; however, this can sometimes lead to slightly higher memory consumption. To switch back to the old construction scheme, you can supply the command line option `-threads 1`.
 
 
 ## Taxonomic Hierarchy
Original file line number	Diff line number	Diff line change
`@@ -1406,9 +1406,7 @@ class action_provider`
`1406`	`1406`	`//---------------------------------------------------------------`
`1407`	`1407`	`/** @brief executes all argument actions */`
`1408`	`1408`	`void execute_actions(const arg_string& arg) const {`
`1409`		`- int i = 0;`
`1410`	`1409`	`for(const auto& a : argActions_) {`
`1411`		`- ++i;`
`1412`	`1410`	`a(arg.c_str());`
`1413`	`1411`	`}`
`1414`	`1412`	`}`
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ You need`
`19`	`19`
`20`	`20`
`21`	`21`	`If your machine doesn't have enough RAM to fit an entire database you can use [database partitioning](partitioning.md) to split up the reference genomes into several partitions.`
	`22`	+As of version 2.6.0, a faster parallel database construction algorithm is used; however, this can sometimes lead to slightly higher memory consumption. To switch back to the old construction scheme, you can supply the command line option `-threads 1`.
`22`	`23`
`23`	`24`
`24`	`25`	`## Taxonomic Hierarchy`