diff --git a/README.md b/README.md index 0095767..ff8ca57 100644 --- a/README.md +++ b/README.md @@ -7,4 +7,56 @@ Use spdlog as log tool and the default level is 'info'. ### Features -* Fast - \ No newline at end of file +* Fast - with the same number of threads, `FastDup` is ~3.5X faster than GATK MarkDuplicatesSpark. + `FastDup` also achieves a ~15X performance improvement over Picard MarkDuplicates. +* Generates outputs identical to those of Picard MarkDuplicates. +* Produces the same detailed metrics data as Picard MarkDuplicates. +* All data is processed in memory, with a low memory footprint even for large input files. + +### Limitations + +* Although `FastDup` detects all the same duplicates as Picard MarkDuplicates, the two tools may mark + different reads as duplicates because the read sorting algorithm in Picard MarkDuplicates is unstable. + Suppose there are 2 reads (A and B, with A in front of B in the file) in a duplicate group and they + have the same score: Picard MarkDuplicates may mark A as duplicate because B may be in front of A + after sorting, while `FastDup` uses a stable sort algorithm and always marks B as duplicate. +* In optical duplicate detection, Picard MarkDuplicates uses short (int16_t) as the data type when parsing + the tile/region, x coordinate and y coordinate from a read name, which may overflow because these integers + may exceed the range of the short type. `FastDup` can fix this bug, but for consistency with Picard MarkDuplicates + we keep it in the source code. Just change the data type in the PhysicalLocation struct in the read_ends.h file + to fix this bug. +* `FastDup` uses the data characteristics of coordinate-ordered SAM/BAM files to improve the performance of + detecting duplicates, so the input should be sorted by coordinate in advance. + +## Requirements + +### Build tools + +* cmake +* c++17 (gcc >= 8.1 or clang >= 7 should work.) 
+ +### Required libraries + +* zlib +* libbz2 +* liblzma +* libcurl +* libdeflate (optional) + +## Install + +Download a distribution tarball `FastDup.tar.gz` or clone the source code from GitHub. + +``` +# build htslib +cd FastDup/ext/htslib +autoreconf -i +./configure +make + +# build FastDup +cd ../.. +mkdir build && cd build +cmake .. -DCMAKE_BUILD_TYPE=Release +make && sudo make install +``` \ No newline at end of file diff --git a/metrics.txt b/metrics.txt deleted file mode 100644 index bb5ff03..0000000 --- a/metrics.txt +++ /dev/null @@ -1,111 +0,0 @@ -## StringHeader -# /home/zzh/work/ngs/FastDup/build/bin/fastdup --input /home/zzh/data/bam/normal_all.sam --output /home/zzh/data1/out.sam --metrics ./metrics.txt --num-threads 1 --create-index --index-format CSI --tag-duplicate-set-members -## StringHeader -# Started on: December 16, 2024 at 02:43:41 AM CST - -## METRICS -LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE -normal 108919 66154888 117508 161395 67414 15058359 876584 0.227945 127582703 - -## HISTOGRAM Double -BIN CoverageMult -1 1.010249 -2 1.611750 -3 1.969882 -4 2.183113 -5 2.310070 -6 2.385660 -7 2.430666 -8 2.457463 -9 2.473417 -10 2.482917 -11 2.488573 -12 2.491940 -13 2.493945 -14 2.495139 -15 2.495850 -16 2.496273 -17 2.496525 -18 2.496675 -19 2.496764 -20 2.496817 -21 2.496849 -22 2.496868 -23 2.496879 -24 2.496886 -25 2.496890 -26 2.496892 -27 2.496894 -28 2.496894 -29 2.496895 -30 2.496895 -31 2.496895 -32 2.496896 -33 2.496896 -34 2.496896 -35 2.496896 -36 2.496896 -37 2.496896 -38 2.496896 -39 2.496896 -40 2.496896 -41 2.496896 -42 2.496896 -43 2.496896 -44 2.496896 -45 2.496896 -46 2.496896 -47 2.496896 -48 2.496896 -49 2.496896 -50 2.496896 -51 2.496896 -52 2.496896 -53 2.496896 -54 2.496896 -55 2.496896 -56 2.496896 -57 2.496896 -58 2.496896 -59 2.496896 -60 2.496896 
-61 2.496896 -62 2.496896 -63 2.496896 -64 2.496896 -65 2.496896 -66 2.496896 -67 2.496896 -68 2.496896 -69 2.496896 -70 2.496896 -71 2.496896 -72 2.496896 -73 2.496896 -74 2.496896 -75 2.496896 -76 2.496896 -77 2.496896 -78 2.496896 -79 2.496896 -80 2.496896 -81 2.496896 -82 2.496896 -83 2.496896 -84 2.496896 -85 2.496896 -86 2.496896 -87 2.496896 -88 2.496896 -89 2.496896 -90 2.496896 -91 2.496896 -92 2.496896 -93 2.496896 -94 2.496896 -95 2.496896 -96 2.496896 -97 2.496896 -98 2.496896 -99 2.496896 -100 2.496896 diff --git a/src/markdup/markdup.cpp b/src/markdup/markdup.cpp index 37067a6..97d552a 100644 --- a/src/markdup/markdup.cpp +++ b/src/markdup/markdup.cpp @@ -303,7 +303,7 @@ int MarkDuplicates() { bam_aux_append(bw->b, "PG", 'Z', nsgv::gMdArg.PROGRAM_RECORD_ID.size() + 1, (const uint8_t *)nsgv::gMdArg.PROGRAM_RECORD_ID.c_str()); } -#if 0 +#if 1 if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) { spdlog::error("failed writing sam record to \"{}\"", nsgv::gMdArg.OUTPUT_FILE.c_str()); sam_close(nsgv::gOutBamFp); diff --git a/src/markdup/md_args.cpp b/src/markdup/md_args.cpp deleted file mode 100644 index dfb4fda..0000000 --- a/src/markdup/md_args.cpp +++ /dev/null @@ -1,26 +0,0 @@ -/* -Description: Markduplicate需要用到的一些参数,读取命令行给的参数,并做一些初始化 - -Copyright : All right reserved by ICT - -Author : Zhang Zhonghai -Date : 2023/10/27 -*/ - -#include -#include -#include -#include -#include - -#include "md_args.h" -using std::cout, std::endl; - -using std::ostringstream; -using std::stod; -using std::stoi; -using std::stol; -using std::string; -using std::vector; - -using namespace nsmd; \ No newline at end of file