@article{jellyfish, title = {A fast, lock-free approach for efficient parallel counting of occurrences of k-mers}, volume = {27}, url = {http://bioinformatics.oxfordjournals.org/content/27/6/764.abstract}, abstract = {Motivation: Counting the number of occurrences of every k-mer (substring of length k) in a long string is a central subproblem in many applications, including genome assembly, error correction of sequencing reads, fast multiple sequence alignment and repeat detection. Recently, the deep sequence coverage generated by next-generation sequencing technologies has caused the amount of sequence to be processed during a genome project to grow rapidly, and has rendered current k-mer counting tools too slow and memory intensive. At the same time, large multicore computers have become commonplace in research facilities allowing for a new parallel computational {paradigm.Results:} We propose a new k-mer counting algorithm and associated implementation, called Jellyfish, which is fast and memory efficient. It is based on a multithreaded, lock-free hash table optimized for counting k-mers up to 31 bases in length. Due to their flexibility, suffix arrays have been the data structure of choice for solving many string problems. For the task of k-mer counting, important in many biological applications, Jellyfish offers a much faster and more memory-efficient {solution.Availability:} The Jellyfish software is written in C++ and is {GPL} licensed. It is available for download at {http://www.cbcb.umd.edu/software/jellyfish.Contact:} {[email protected]} information: Supplementary data are available at Bioinformatics online.}, number = {6}, journal = {Bioinformatics}, author = {Mar\c{c}ais, Guillaume and Kingsford, Carl}, year = {2011}, pages = {764--770} }, @Article{bfcounter, AUTHOR = {Melsted, Pall and Pritchard, Jonathan}, TITLE = {Efficient counting of k-mers in DNA sequences using a Bloom filter}, JOURNAL = {BMC Bioinformatics}, VOLUME = {12}, YEAR = {2011}, NUMBER = {1}, PAGES = {333}, URL = {http://www.biomedcentral.com/1471-2105/12/333}, DOI = {10.1186/1471-2105-12-333}, PubMedID = {21831268}, ISSN = {1471-2105}, ABSTRACT = {BACKGROUND:Counting k-mers (substrings of length k in DNA sequence data) is an essential component of many methods in bioinformatics, including for genome and transcriptome assembly, for metagenomic sequencing, and for error correction of sequence reads. Although simple in principle, counting k-mers in large modern sequence data sets can easily overwhelm the memory capacity of standard computers. In current data sets, a large fraction-often more than 50%-of the storage capacity may be spent on storing k-mers that contain sequencing errors and which are typically observed only a single time in the data. These singleton k-mers are uninformative for many algorithms without some kind of error correction.RESULTS:We present a new method that identifies all the k-mers that occur more than once in a DNA sequence data set. Our method does this using a Bloom filter, a probabilistic data structure that stores all the observed k-mers implicitly in memory with greatly reduced memory requirements. We then make a second sweep through the data to provide exact counts of all nonunique k-mers. For example data sets, we report up to 50% savings in memory usage compared to current software, with modest costs in computational speed. This approach may reduce memory requirements for any algorithm that starts by counting k-mers in sequence data with errors.CONCLUSIONS:A reference implementation for this methodology, BFCounter, is written in C++ and is GPL licensed. It is available for free download at http://pritch.bsd.uchicago.edu/bfcounter.html webcite}, },