2014-11-14 04:51:00 +08:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
|
use warnings;
|
|
|
|
|
use Getopt::Std;
|
|
|
|
|
|
|
|
|
|
my %opts = (t=>1, n=>64);
|
2014-11-14 05:16:21 +08:00
|
|
|
getopts("SpADso:R:x:t:", \%opts);
|
2014-11-14 04:51:00 +08:00
|
|
|
|
|
|
|
|
die('
|
|
|
|
|
Usage: run-bwamem [options] <idxbase> <file1> [file2]
|
|
|
|
|
|
2014-11-14 05:16:21 +08:00
|
|
|
Options: -o STR prefix for output files [inferred from input]
|
2014-11-14 04:51:00 +08:00
|
|
|
-R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null]
|
|
|
|
|
-x STR read type: pacbio, ont2d or intractg [default]
|
|
|
|
|
-t INT number of threads [1]
|
2014-11-14 05:16:21 +08:00
|
|
|
-p input are paired-end reads if file2 absent
|
2014-11-14 04:51:00 +08:00
|
|
|
|
|
|
|
|
-A skip HiSeq2000/2500 PE resequencing adapter trimming (via trimadap)
|
|
|
|
|
-D skip duplicate marking (via samblaster)
|
|
|
|
|
-S for SAM/BAM input, don\'t shuffle
|
|
|
|
|
-s sort the output alignment (more RAM)
|
|
|
|
|
|
|
|
|
|
Note: File type is determined by the file extension of the first input sequence file.
|
|
|
|
|
Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz,
|
|
|
|
|
fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ.
|
|
|
|
|
|
|
|
|
|
') if @ARGV < 2;
|
|
|
|
|
|
|
|
|
|
warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") unless defined($opts{R});
|
|
|
|
|
|
|
|
|
|
my $idx = $ARGV[0];
|
|
|
|
|
|
|
|
|
|
my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
|
|
|
|
|
my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
|
|
|
|
|
die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
|
|
|
|
|
|
|
|
|
|
die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
|
|
|
|
|
unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
|
|
|
|
|
|
|
|
|
|
if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
|
|
|
|
|
warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
|
|
|
|
|
@ARGV = 2;
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-14 05:16:21 +08:00
|
|
|
if (defined($opts{p}) && @ARGV >= 3) {
|
2014-11-14 04:51:00 +08:00
|
|
|
warn("WARNING: option -P is ignored as there are two input sequence files.\n");
|
2014-11-14 05:16:21 +08:00
|
|
|
delete $opts{p};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
my $prefix;
|
|
|
|
|
if (defined $opts{o}) {
|
|
|
|
|
$prefix = $opts{o};
|
|
|
|
|
} elsif (@ARGV >= 3) {
|
|
|
|
|
my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
|
|
|
|
|
my $i;
|
|
|
|
|
for ($i = 0; $i < $len; ++$i) {
|
|
|
|
|
last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
|
|
|
|
|
}
|
|
|
|
|
$prefix = substr($ARGV[1], 0, $i) if $i > 0;
|
|
|
|
|
} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|sam|sam\.gz|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
|
|
|
|
|
$prefix = $1;
|
2014-11-14 04:51:00 +08:00
|
|
|
}
|
2014-11-14 05:16:21 +08:00
|
|
|
die("ERROR: failed to identify the prefix for output. Please specify -p.\n") unless defined($prefix);
|
2014-11-14 04:51:00 +08:00
|
|
|
|
|
|
|
|
my $size = 0;
|
|
|
|
|
my $comp_ratio = 3.;
|
|
|
|
|
for my $f (@ARGV[1..$#ARGV]) {
|
|
|
|
|
my @a = stat($f);
|
|
|
|
|
my $s = $a[7];
|
2014-11-14 05:16:21 +08:00
|
|
|
die("ERROR: failed to read file $f\n") if !defined($s);
|
2014-11-14 04:51:00 +08:00
|
|
|
$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
|
|
|
|
|
$size += int($s) + 1;
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-14 05:16:21 +08:00
|
|
|
my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
|
2014-11-14 04:51:00 +08:00
|
|
|
my $is_sam = $ARGV[1] =~ /\.(sam|sam\.gz)$/? 1 : 0;
|
|
|
|
|
my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
|
|
|
|
|
|
|
|
|
|
my $cmd = '';
|
|
|
|
|
if ($is_sam || $is_bam) {
|
|
|
|
|
my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n";
|
|
|
|
|
my $ntmps = int($size / 4e9) + 1;
|
|
|
|
|
my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
|
|
|
|
|
my $cmd_bam2fq = "";
|
2014-11-14 05:16:21 +08:00
|
|
|
$cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.out.se.fq.gz " : "") . "- \\\n";
|
2014-11-14 04:51:00 +08:00
|
|
|
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
|
|
|
|
|
} elsif (@ARGV >= 3) {
|
|
|
|
|
$cmd = " | $root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
|
|
|
|
|
} else {
|
|
|
|
|
$cmd = "cat $ARGV[1] \\\n";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$cmd .= " | $root/trimadap \\\n" unless defined($opts{A});
|
|
|
|
|
$cmd .= " | $root/bwa mem -t$opts{t} " . (defined($opts{x})? "-x $opts{x} " : "") . ($is_pe? "-P " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n";
|
|
|
|
|
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" unless defined($opts{D});
|
|
|
|
|
|
|
|
|
|
my $has_hla = 0;
|
|
|
|
|
if (-f "$ARGV[0].alt") {
|
|
|
|
|
my $fh;
|
|
|
|
|
open($fh, "$ARGV[0].alt") || die;
|
|
|
|
|
while (<$fh>) {
|
|
|
|
|
$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
|
|
|
|
|
}
|
|
|
|
|
close($fh);
|
|
|
|
|
my $hla_pre = $has_hla? "-p $prefix.out " : "";
|
|
|
|
|
$cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
my $t_sort = $opts{t} < 4? $opts{t} : 4;
|
|
|
|
|
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.out;\n" : " | gzip -1 > $prefix.out.sam.gz;\n";
|
|
|
|
|
$cmd .= "$root/run-HLA $prefix.out;\n" if ($has_hla);
|
|
|
|
|
|
|
|
|
|
print $cmd;
|
|
|
|
|
|
|
|
|
|
sub which
|
|
|
|
|
{
|
|
|
|
|
my $file = shift;
|
|
|
|
|
my $path = (@_)? shift : $ENV{PATH};
|
|
|
|
|
return if (!defined($path));
|
|
|
|
|
foreach my $x (split(":", $path)) {
|
|
|
|
|
$x =~ s/\/$//;
|
|
|
|
|
return "$x/$file" if (-x "$x/$file");
|
|
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|