添加README内容
This commit is contained in:
parent
01d14d539f
commit
52c5610f1c
54
README.md
54
README.md
|
|
@ -7,4 +7,56 @@ Use spdlog as log tool and the default level is 'info'.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
* Fast -
|
* Fast - with the same number of threads `FastDup` is ~3.5X faster than GATK MarkDuplicatesSpark.
|
||||||
|
And `FastDup` achieves a ~15X performance improvement over Picard MarkDuplicates.
|
||||||
|
* Generates outputs identical to those of Picard MarkDuplicates.
|
||||||
|
* The same detailed metrics data as Picard MarkDuplicates.
|
||||||
|
* All data is processed in memory, with a low memory footprint even for large input files.
|
||||||
|
|
||||||
|
### Limitations
|
||||||
|
|
||||||
|
* Although `FastDup` can detect all the same duplicates as Picard MarkDuplicates, the two tools may mark
|
||||||
|
different reads as duplicates because the reads sort algorithm in Picard MarkDuplicates is unstable.
|
||||||
|
If there are 2 reads (A and B, with A in front of B in the file) in a duplicate group and they
|
||||||
|
have the same score, Picard MarkDuplicates may mark A as a duplicate because B may be in front of A
|
||||||
|
after sorting. In contrast, `FastDup` uses a stable sort algorithm and always marks B as the duplicate.
|
||||||
|
* In optical duplicate detection, Picard MarkDuplicates uses short (int16_t) as the data type when parsing
|
||||||
|
tile/region, x coordinate and y coordinate from a read name, which may cause data overflow as these integers
|
||||||
|
may exceed the range of the short type. `FastDup` can fix this bug, but for consistency with Picard MarkDuplicates,
|
||||||
|
we keep this bug in the source code. Just change the data type in the PhysicalLocation struct in the read_ends.h file
|
||||||
|
to fix this bug.
|
||||||
|
* `FastDup` uses the data characteristics of coordinate-ordered SAM/BAM files to improve the performance of
|
||||||
|
detecting duplicates, thus the input should be ordered by coordinate in advance.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### Build tools
|
||||||
|
|
||||||
|
* cmake
|
||||||
|
* c++17 (gcc >= 8.1 or clang >= 7 should work.)
|
||||||
|
|
||||||
|
### Required libraries
|
||||||
|
|
||||||
|
* zlib
|
||||||
|
* libbz2
|
||||||
|
* liblzma
|
||||||
|
* libcurl
|
||||||
|
* libdeflate (optional)
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
Download a distribution tarball `FastDup.tar.gz` or clone the source code from GitHub.
|
||||||
|
|
||||||
|
```
|
||||||
|
# build htslib
|
||||||
|
cd FastDup/ext/htslib
|
||||||
|
autoreconf -i
|
||||||
|
./configure
|
||||||
|
make
|
||||||
|
|
||||||
|
# build FastDup
|
||||||
|
cd FastDup
|
||||||
|
mkdir build && cd build
|
||||||
|
cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||||
|
make && sudo make install
|
||||||
|
```
|
||||||
111
metrics.txt
111
metrics.txt
|
|
@ -1,111 +0,0 @@
|
||||||
## StringHeader
|
|
||||||
# /home/zzh/work/ngs/FastDup/build/bin/fastdup --input /home/zzh/data/bam/normal_all.sam --output /home/zzh/data1/out.sam --metrics ./metrics.txt --num-threads 1 --create-index --index-format CSI --tag-duplicate-set-members
|
|
||||||
## StringHeader
|
|
||||||
# Started on: December 16, 2024 at 02:43:41 AM CST
|
|
||||||
|
|
||||||
## METRICS
|
|
||||||
LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE
|
|
||||||
normal 108919 66154888 117508 161395 67414 15058359 876584 0.227945 127582703
|
|
||||||
|
|
||||||
## HISTOGRAM Double
|
|
||||||
BIN CoverageMult
|
|
||||||
1 1.010249
|
|
||||||
2 1.611750
|
|
||||||
3 1.969882
|
|
||||||
4 2.183113
|
|
||||||
5 2.310070
|
|
||||||
6 2.385660
|
|
||||||
7 2.430666
|
|
||||||
8 2.457463
|
|
||||||
9 2.473417
|
|
||||||
10 2.482917
|
|
||||||
11 2.488573
|
|
||||||
12 2.491940
|
|
||||||
13 2.493945
|
|
||||||
14 2.495139
|
|
||||||
15 2.495850
|
|
||||||
16 2.496273
|
|
||||||
17 2.496525
|
|
||||||
18 2.496675
|
|
||||||
19 2.496764
|
|
||||||
20 2.496817
|
|
||||||
21 2.496849
|
|
||||||
22 2.496868
|
|
||||||
23 2.496879
|
|
||||||
24 2.496886
|
|
||||||
25 2.496890
|
|
||||||
26 2.496892
|
|
||||||
27 2.496894
|
|
||||||
28 2.496894
|
|
||||||
29 2.496895
|
|
||||||
30 2.496895
|
|
||||||
31 2.496895
|
|
||||||
32 2.496896
|
|
||||||
33 2.496896
|
|
||||||
34 2.496896
|
|
||||||
35 2.496896
|
|
||||||
36 2.496896
|
|
||||||
37 2.496896
|
|
||||||
38 2.496896
|
|
||||||
39 2.496896
|
|
||||||
40 2.496896
|
|
||||||
41 2.496896
|
|
||||||
42 2.496896
|
|
||||||
43 2.496896
|
|
||||||
44 2.496896
|
|
||||||
45 2.496896
|
|
||||||
46 2.496896
|
|
||||||
47 2.496896
|
|
||||||
48 2.496896
|
|
||||||
49 2.496896
|
|
||||||
50 2.496896
|
|
||||||
51 2.496896
|
|
||||||
52 2.496896
|
|
||||||
53 2.496896
|
|
||||||
54 2.496896
|
|
||||||
55 2.496896
|
|
||||||
56 2.496896
|
|
||||||
57 2.496896
|
|
||||||
58 2.496896
|
|
||||||
59 2.496896
|
|
||||||
60 2.496896
|
|
||||||
61 2.496896
|
|
||||||
62 2.496896
|
|
||||||
63 2.496896
|
|
||||||
64 2.496896
|
|
||||||
65 2.496896
|
|
||||||
66 2.496896
|
|
||||||
67 2.496896
|
|
||||||
68 2.496896
|
|
||||||
69 2.496896
|
|
||||||
70 2.496896
|
|
||||||
71 2.496896
|
|
||||||
72 2.496896
|
|
||||||
73 2.496896
|
|
||||||
74 2.496896
|
|
||||||
75 2.496896
|
|
||||||
76 2.496896
|
|
||||||
77 2.496896
|
|
||||||
78 2.496896
|
|
||||||
79 2.496896
|
|
||||||
80 2.496896
|
|
||||||
81 2.496896
|
|
||||||
82 2.496896
|
|
||||||
83 2.496896
|
|
||||||
84 2.496896
|
|
||||||
85 2.496896
|
|
||||||
86 2.496896
|
|
||||||
87 2.496896
|
|
||||||
88 2.496896
|
|
||||||
89 2.496896
|
|
||||||
90 2.496896
|
|
||||||
91 2.496896
|
|
||||||
92 2.496896
|
|
||||||
93 2.496896
|
|
||||||
94 2.496896
|
|
||||||
95 2.496896
|
|
||||||
96 2.496896
|
|
||||||
97 2.496896
|
|
||||||
98 2.496896
|
|
||||||
99 2.496896
|
|
||||||
100 2.496896
|
|
||||||
|
|
@ -303,7 +303,7 @@ int MarkDuplicates() {
|
||||||
bam_aux_append(bw->b, "PG", 'Z', nsgv::gMdArg.PROGRAM_RECORD_ID.size() + 1,
|
bam_aux_append(bw->b, "PG", 'Z', nsgv::gMdArg.PROGRAM_RECORD_ID.size() + 1,
|
||||||
(const uint8_t *)nsgv::gMdArg.PROGRAM_RECORD_ID.c_str());
|
(const uint8_t *)nsgv::gMdArg.PROGRAM_RECORD_ID.c_str());
|
||||||
}
|
}
|
||||||
#if 0
|
#if 1
|
||||||
if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
|
if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
|
||||||
spdlog::error("failed writing sam record to \"{}\"", nsgv::gMdArg.OUTPUT_FILE.c_str());
|
spdlog::error("failed writing sam record to \"{}\"", nsgv::gMdArg.OUTPUT_FILE.c_str());
|
||||||
sam_close(nsgv::gOutBamFp);
|
sam_close(nsgv::gOutBamFp);
|
||||||
|
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
/*
|
|
||||||
Description: Markduplicate需要用到的一些参数,读取命令行给的参数,并做一些初始化
|
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
|
||||||
|
|
||||||
Author : Zhang Zhonghai
|
|
||||||
Date : 2023/10/27
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <cstring>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "md_args.h"
|
|
||||||
using std::cout, std::endl;
|
|
||||||
|
|
||||||
using std::ostringstream;
|
|
||||||
using std::stod;
|
|
||||||
using std::stoi;
|
|
||||||
using std::stol;
|
|
||||||
using std::string;
|
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
using namespace nsmd;
|
|
||||||
Loading…
Reference in New Issue