添加README内容
This commit is contained in:
parent
01d14d539f
commit
52c5610f1c
54
README.md
54
README.md
|
|
@ -7,4 +7,56 @@ Use spdlog as log tool and the default level is 'info'.
|
|||
|
||||
### Features
|
||||
|
||||
* Fast -
|
||||
* Fast - with the same number of threads `FastDup` is ~3.5X faster than GATK MarkDuplicatesSpark.
|
||||
And `FastDup` achieves a ~15X performance improvement over Picard MarkDuplicates.
|
||||
* Generates outputs identical to those of Picard MarkDuplicates.
|
||||
* The same detailed metrics data as Picard MarkDuplicates.
|
||||
* All data is processed in memory, with a low memory footprint even for large input files.
|
||||
|
||||
### Limitations
|
||||
|
||||
* Although `FastDup` detects all the same duplicates as Picard MarkDuplicates, the two tools may mark
|
||||
different reads as duplicates because the reads sort algorithm in Picard MarkDuplicates is unstable.
|
||||
Considering there are 2 reads (A and B, where A is in front of B in the file) in a duplicate group and they
|
||||
have the same score, Picard MarkDuplicates may mark A as duplicate because B may be in front of A
|
||||
after sorting, while `FastDup` uses a stable sort algorithm and always marks B as duplicate.
|
||||
* In optical duplicate detection, Picard MarkDuplicates uses short (int16_t) as the data type when parsing
|
||||
tile/region, x coordinate and y coordinate from a read name, which may cause data overflow as these integers
|
||||
may exceed the range of short type. `FastDup` can fix this bug, but for consistency with Picard MarkDuplicates,
|
||||
we keep this bug in the source code. Just change the data type in the PhysicalLocation struct in the read_ends.h file
|
||||
to fix this bug.
|
||||
* `FastDup` uses the data characteristics of coordinate-ordered SAM/BAM files to improve the performance of
|
||||
detecting duplicates, so the input must be sorted by coordinate in advance.
|
||||
|
||||
## Requirements
|
||||
|
||||
### Build tools
|
||||
|
||||
* cmake
|
||||
* c++17 (gcc >= 8.1 or clang >= 7 should work.)
|
||||
|
||||
### Required libraries
|
||||
|
||||
* zlib
|
||||
* libbz2
|
||||
* liblzma
|
||||
* libcurl
|
||||
* libdeflate (optional)
|
||||
|
||||
## Install
|
||||
|
||||
Download a distribution tarball `FastDup.tar.gz` or clone the source code from GitHub.
|
||||
|
||||
```
|
||||
# build htslib
|
||||
cd FastDup/ext/htslib
|
||||
autoreconf -i
|
||||
./configure
|
||||
make
|
||||
|
||||
# build FastDup
|
||||
cd ../..
|
||||
mkdir build && cd build
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||
make && sudo make install
|
||||
```
|
||||
111
metrics.txt
111
metrics.txt
|
|
@ -1,111 +0,0 @@
|
|||
## StringHeader
|
||||
# /home/zzh/work/ngs/FastDup/build/bin/fastdup --input /home/zzh/data/bam/normal_all.sam --output /home/zzh/data1/out.sam --metrics ./metrics.txt --num-threads 1 --create-index --index-format CSI --tag-duplicate-set-members
|
||||
## StringHeader
|
||||
# Started on: December 16, 2024 at 02:43:41 AM CST
|
||||
|
||||
## METRICS
|
||||
LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE
|
||||
normal 108919 66154888 117508 161395 67414 15058359 876584 0.227945 127582703
|
||||
|
||||
## HISTOGRAM Double
|
||||
BIN CoverageMult
|
||||
1 1.010249
|
||||
2 1.611750
|
||||
3 1.969882
|
||||
4 2.183113
|
||||
5 2.310070
|
||||
6 2.385660
|
||||
7 2.430666
|
||||
8 2.457463
|
||||
9 2.473417
|
||||
10 2.482917
|
||||
11 2.488573
|
||||
12 2.491940
|
||||
13 2.493945
|
||||
14 2.495139
|
||||
15 2.495850
|
||||
16 2.496273
|
||||
17 2.496525
|
||||
18 2.496675
|
||||
19 2.496764
|
||||
20 2.496817
|
||||
21 2.496849
|
||||
22 2.496868
|
||||
23 2.496879
|
||||
24 2.496886
|
||||
25 2.496890
|
||||
26 2.496892
|
||||
27 2.496894
|
||||
28 2.496894
|
||||
29 2.496895
|
||||
30 2.496895
|
||||
31 2.496895
|
||||
32 2.496896
|
||||
33 2.496896
|
||||
34 2.496896
|
||||
35 2.496896
|
||||
36 2.496896
|
||||
37 2.496896
|
||||
38 2.496896
|
||||
39 2.496896
|
||||
40 2.496896
|
||||
41 2.496896
|
||||
42 2.496896
|
||||
43 2.496896
|
||||
44 2.496896
|
||||
45 2.496896
|
||||
46 2.496896
|
||||
47 2.496896
|
||||
48 2.496896
|
||||
49 2.496896
|
||||
50 2.496896
|
||||
51 2.496896
|
||||
52 2.496896
|
||||
53 2.496896
|
||||
54 2.496896
|
||||
55 2.496896
|
||||
56 2.496896
|
||||
57 2.496896
|
||||
58 2.496896
|
||||
59 2.496896
|
||||
60 2.496896
|
||||
61 2.496896
|
||||
62 2.496896
|
||||
63 2.496896
|
||||
64 2.496896
|
||||
65 2.496896
|
||||
66 2.496896
|
||||
67 2.496896
|
||||
68 2.496896
|
||||
69 2.496896
|
||||
70 2.496896
|
||||
71 2.496896
|
||||
72 2.496896
|
||||
73 2.496896
|
||||
74 2.496896
|
||||
75 2.496896
|
||||
76 2.496896
|
||||
77 2.496896
|
||||
78 2.496896
|
||||
79 2.496896
|
||||
80 2.496896
|
||||
81 2.496896
|
||||
82 2.496896
|
||||
83 2.496896
|
||||
84 2.496896
|
||||
85 2.496896
|
||||
86 2.496896
|
||||
87 2.496896
|
||||
88 2.496896
|
||||
89 2.496896
|
||||
90 2.496896
|
||||
91 2.496896
|
||||
92 2.496896
|
||||
93 2.496896
|
||||
94 2.496896
|
||||
95 2.496896
|
||||
96 2.496896
|
||||
97 2.496896
|
||||
98 2.496896
|
||||
99 2.496896
|
||||
100 2.496896
|
||||
|
|
@ -303,7 +303,7 @@ int MarkDuplicates() {
|
|||
bam_aux_append(bw->b, "PG", 'Z', nsgv::gMdArg.PROGRAM_RECORD_ID.size() + 1,
|
||||
(const uint8_t *)nsgv::gMdArg.PROGRAM_RECORD_ID.c_str());
|
||||
}
|
||||
#if 0
|
||||
#if 1
|
||||
if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
|
||||
spdlog::error("failed writing sam record to \"{}\"", nsgv::gMdArg.OUTPUT_FILE.c_str());
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
|
|
|
|||
|
|
@ -1,26 +0,0 @@
|
|||
/*
|
||||
Description: Markduplicate需要用到的一些参数,读取命令行给的参数,并做一些初始化
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2023/10/27
|
||||
*/
|
||||
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "md_args.h"
|
||||
using std::cout, std::endl;
|
||||
|
||||
using std::ostringstream;
|
||||
using std::stod;
|
||||
using std::stoi;
|
||||
using std::stol;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
using namespace nsmd;
|
||||
Loading…
Reference in New Issue