fast-bwa/extras/run-bwamem

162 lines
6.0 KiB
Plaintext
Raw Normal View History

2014-11-14 04:51:00 +08:00
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Std;
my %opts = (t=>1, n=>64);
2014-11-17 23:44:54 +08:00
getopts("SpAdsko:R:x:t:", \%opts);
2014-11-14 04:51:00 +08:00
die('
Usage: run-bwamem [options] <idxbase> <file1> [file2]
2014-11-14 05:16:21 +08:00
Options: -o STR prefix for output files [inferred from input]
2014-11-14 04:51:00 +08:00
-R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null]
-x STR read type: pacbio, ont2d or intractg [default]
2014-11-14 08:52:47 +08:00
intractg: intra-species contig (kb query, highly similar)
pacbio: pacbio subreads (~10kb query, high error rate)
ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
2014-11-14 04:51:00 +08:00
-t INT number of threads [1]
2014-11-14 05:16:21 +08:00
-p input are paired-end reads if file2 absent
2014-11-14 04:51:00 +08:00
-A skip HiSeq2000/2500 PE resequencing adapter trimming (via trimadap)
2014-11-17 23:44:54 +08:00
-d mark duplicate (via samblaster)
2014-11-14 04:51:00 +08:00
-S for SAM/BAM input, don\'t shuffle
-s sort the output alignment (more RAM)
2014-11-14 08:52:47 +08:00
-k keep temporary files generated by typeHLA
2014-11-14 04:51:00 +08:00
Note: File type is determined by the file extension of the first input sequence file.
Recognized file extensions: fasta, fa, fastq, fq, fasta.gz, fa.gz, fastq.gz,
fq.gz, sam, sam.gz and bam. SAM/BAM input will be converted to FASTQ.
2014-11-17 23:44:54 +08:00
When option -d is in use, all the input reads are required to come from the same
library and all the reads from the library shall be included in the input. In
addition, samblaster does not distinguish optical and PCR duplicates.
The output may consist of the following files:
{-o}.aln.sam.gz - unsorted alignment (unless -s is specified)
{-o}.aln.bam - sorted alignment (if -s is specified)
{-o}.oph.sam.gz - orphan single-end reads mapping (if input is paired SAM/BAM)
{-o}.hla.top - best HLA typing for the six classical HLA genes
{-o}.hla.all - additional HLA typing
{-o}.log.* - log files
2014-11-14 04:51:00 +08:00
') if @ARGV < 2;
warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") unless defined($opts{R});
my $idx = $ARGV[0];
my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
@ARGV = 2;
}
2014-11-14 05:16:21 +08:00
if (defined($opts{p}) && @ARGV >= 3) {
2014-11-14 04:51:00 +08:00
warn("WARNING: option -P is ignored as there are two input sequence files.\n");
2014-11-14 05:16:21 +08:00
delete $opts{p};
}
my $prefix;
if (defined $opts{o}) {
$prefix = $opts{o};
} elsif (@ARGV >= 3) {
my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
my $i;
for ($i = 0; $i < $len; ++$i) {
last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
}
$prefix = substr($ARGV[1], 0, $i) if $i > 0;
} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|sam|sam\.gz|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
$prefix = $1;
2014-11-14 04:51:00 +08:00
}
2014-11-14 05:16:21 +08:00
die("ERROR: failed to identify the prefix for output. Please specify -p.\n") unless defined($prefix);
2014-11-14 04:51:00 +08:00
my $size = 0;
my $comp_ratio = 3.;
for my $f (@ARGV[1..$#ARGV]) {
my @a = stat($f);
my $s = $a[7];
2014-11-14 05:16:21 +08:00
die("ERROR: failed to read file $f\n") if !defined($s);
2014-11-14 04:51:00 +08:00
$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
$size += int($s) + 1;
}
2014-11-14 05:16:21 +08:00
my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
2014-11-14 04:51:00 +08:00
my $is_sam = $ARGV[1] =~ /\.(sam|sam\.gz)$/? 1 : 0;
my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
2014-11-14 08:52:47 +08:00
if (defined($opts{x})) {
2014-11-17 23:44:54 +08:00
$opts{A} = 1; delete($opts{d});
2014-11-14 08:52:47 +08:00
delete $opts{p};
}
2014-11-14 04:51:00 +08:00
my $cmd = '';
if ($is_sam || $is_bam) {
my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n";
my $ntmps = int($size / 4e9) + 1;
my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
my $cmd_bam2fq = "";
2014-11-17 23:44:54 +08:00
$cmd_bam2fq = " | $root/htsbox bam2fq -O " . (defined($opts{p})? "-s $prefix.oph.fq.gz " : "") . "- \\\n";
2014-11-14 04:51:00 +08:00
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
} elsif (@ARGV >= 3) {
2014-11-14 08:52:47 +08:00
$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
2014-11-14 04:51:00 +08:00
} else {
$cmd = "cat $ARGV[1] \\\n";
}
2014-11-17 23:44:54 +08:00
my $bwa_opts = ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
2014-11-14 04:51:00 +08:00
$cmd .= " | $root/trimadap \\\n" unless defined($opts{A});
2014-11-17 23:44:54 +08:00
$cmd .= " | $root/bwa mem $bwa_opts" . ($is_pe? "-p " : "") . "$ARGV[0] - 2> $prefix.log.bwamem \\\n";
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
2014-11-14 04:51:00 +08:00
my $has_hla = 0;
if (-f "$ARGV[0].alt") {
my $fh;
open($fh, "$ARGV[0].alt") || die;
while (<$fh>) {
$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
}
close($fh);
2014-11-17 23:44:54 +08:00
my $hla_pre = $has_hla? "-p $prefix.hla " : "";
2014-11-14 04:51:00 +08:00
$cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
}
my $t_sort = $opts{t} < 4? $opts{t} : 4;
2014-11-17 23:44:54 +08:00
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | gzip -1 > $prefix.aln.sam.gz;\n";
# map single end reads
if ($cmd =~ /$prefix.oph.fq.gz/) {
$cmd .= "[ -s $prefix.oph.fq.gz ] && ";
$cmd .= "$root/bwa mem $bwa_opts$ARGV[0] $prefix.oph.fq.gz 2>> $prefix.log.bwamem | gzip -1 > $prefix.oph.sam.gz; rm -f $prefix.oph.fq.gz;\n";
}
2014-11-14 08:52:47 +08:00
if ($has_hla && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
2014-11-17 23:44:54 +08:00
$cmd .= "$root/run-HLA ". ($opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
$cmd .= "cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
2014-11-14 08:52:47 +08:00
}
2014-11-14 04:51:00 +08:00
print $cmd;
sub which
{
my $file = shift;
my $path = (@_)? shift : $ENV{PATH};
return if (!defined($path));
foreach my $x (split(":", $path)) {
$x =~ s/\/$//;
return "$x/$file" if (-x "$x/$file");
}
return;
}