2009-03-16 06:42:24 +08:00
package org.broadinstitute.sting.gatk.walkers ;
import org.broadinstitute.sting.gatk.LocusContext ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum ;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP ;
2009-04-04 03:54:54 +08:00
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker ;
2009-03-29 04:37:27 +08:00
import org.broadinstitute.sting.utils.cmdLine.Argument ;
2009-04-15 06:13:10 +08:00
import org.broadinstitute.sting.utils.ReadBackedPileup ;
2009-05-22 06:25:37 +08:00
import org.broadinstitute.sting.utils.Utils ;
import net.sf.samtools.SAMRecord ;
import java.util.List ;
2009-03-16 06:42:24 +08:00
/ * *
2009-05-22 06:25:37 +08:00
* samtools pileup [ - f in . ref . fasta ] [ - t in . ref_list ] [ - l in . site_list ] [ - iscg ] [ - T theta ] [ - N nHap ] [ - r pairDiffRate ] < in . alignment >
*
* Print the alignment in the pileup format . In the pileup format , each line represents a genomic position ,
* consisting of chromosome name , coordinate , reference base , read bases , read qualities and alignment mapping
* qualities . Information on match , mismatch , indel , strand , mapping quality and start and end of a read are all
* encoded at the read base column . At this column , a dot stands for a match to the reference base on the forward strand ,
* a comma for a match on the reverse strand , <EFBFBD> ACGTN <EFBFBD> for a mismatch on the forward strand and <EFBFBD> acgtn <EFBFBD> for a mismatch on the
* reverse strand .
*
* A pattern <EFBFBD> \ + [ 0 - 9 ] + [ ACGTNacgtn ] + <EFBFBD> indicates there is an insertion between this reference position and the next
* reference position . The length of the insertion is given by the integer in the pattern , followed by the inserted sequence .
* Similarly , a pattern <EFBFBD> - [ 0 - 9 ] + [ ACGTNacgtn ] + <EFBFBD> represents a deletion from the reference .
* Also at the read base column , a symbol <EFBFBD> ^ <EFBFBD> marks the start of a read segment which is a contiguous subsequence on the read
* separated by <EFBFBD> N / S / H <EFBFBD> CIGAR operations . The ASCII of the character following <EFBFBD> ^ <EFBFBD> minus 33 gives the mapping quality .
* A symbol <EFBFBD> $ <EFBFBD> marks the end of a read segment .
2009-03-16 06:42:24 +08:00
* /
2009-05-02 05:40:46 +08:00
public class PileupWalker extends LocusWalker < Integer , Integer > implements TreeReducible < Integer > {
2009-05-22 06:25:37 +08:00
@Argument ( fullName = "alwaysShowSecondBase" , doc = "If true, prints dummy bases for the second bases in the BAM file where they are missing" , required = false )
public boolean alwaysShowSecondBase = false ;
@Argument ( fullName = "showSecondBaseQuals" , doc = "If true, prints out second base qualities in the pileup" , required = false )
public boolean showSecondBaseQuals = false ;
2009-04-22 06:27:26 +08:00
2009-05-07 09:22:01 +08:00
@Argument ( fullName = "extended" , shortName = "ext" , doc = "extended" , required = false )
public boolean EXTENDED = false ;
2009-05-22 06:25:37 +08:00
2009-03-29 04:37:27 +08:00
public boolean FLAG_UNCOVERED_BASES = true ; // todo: how do I make this a command line argument?
2009-03-16 06:42:24 +08:00
public void initialize ( ) {
}
2009-04-04 03:54:54 +08:00
public Integer map ( RefMetaDataTracker tracker , char ref , LocusContext context ) {
2009-04-15 06:21:36 +08:00
ReadBackedPileup pileup = new ReadBackedPileup ( ref , context ) ;
2009-04-15 06:13:10 +08:00
String bases = pileup . getBases ( ) ;
2009-03-29 04:37:27 +08:00
if ( bases . equals ( "" ) & & FLAG_UNCOVERED_BASES ) {
2009-05-22 06:25:37 +08:00
bases = "***UNCOVERED_SITE***" ;
}
StringBuilder extras = new StringBuilder ( ) ;
String secondBasePileup = pileup . getSecondaryBasePileup ( ) ;
if ( secondBasePileup = = null & & alwaysShowSecondBase ) {
secondBasePileup = Utils . dupString ( 'N' , bases . length ( ) ) ;
2009-03-29 04:37:27 +08:00
}
2009-05-22 06:25:37 +08:00
if ( secondBasePileup ! = null ) extras . append ( " " ) . append ( secondBasePileup ) ;
2009-03-29 04:37:27 +08:00
2009-05-22 06:25:37 +08:00
if ( showSecondBaseQuals ) {
String secondQualPileup = pileup . getSecondaryQualPileup ( ) ;
if ( secondQualPileup = = null )
secondQualPileup = Utils . dupString ( ( char ) ( 33 ) , bases . length ( ) ) ;
extras . append ( " " ) . append ( secondQualPileup ) ;
2009-04-14 08:54:48 +08:00
}
2009-03-16 06:42:24 +08:00
String rodString = "" ;
2009-04-04 03:54:54 +08:00
for ( ReferenceOrderedDatum datum : tracker . getAllRods ( ) ) {
if ( datum ! = null & & ! ( datum instanceof rodDbSNP ) ) {
2009-04-15 06:13:10 +08:00
//System.out.printf("rod = %s%n", datum.toSimpleString());
2009-04-04 03:54:54 +08:00
rodString + = datum . toSimpleString ( ) ;
2009-04-15 06:13:10 +08:00
//System.out.printf("Rod string %s%n", rodString);
2009-03-16 06:42:24 +08:00
}
}
2009-04-15 06:13:10 +08:00
2009-04-04 03:54:54 +08:00
rodDbSNP dbsnp = ( rodDbSNP ) tracker . lookup ( "dbSNP" , null ) ;
if ( dbsnp ! = null )
rodString + = dbsnp . toMediumString ( ) ;
2009-03-16 06:42:24 +08:00
if ( rodString ! = "" )
rodString = "[ROD: " + rodString + "]" ;
2009-03-27 08:12:35 +08:00
//if ( context.getLocation().getStart() % 1 == 0 ) {
2009-04-15 06:13:10 +08:00
out . printf ( "%s%s %s%n" , pileup . getPileupString ( ) , extras , rodString ) ;
2009-03-27 08:12:35 +08:00
//}
2009-03-16 06:42:24 +08:00
2009-04-22 06:27:26 +08:00
if ( EXTENDED ) {
String probDists = pileup . getProbDistPileup ( ) ;
2009-05-02 05:40:46 +08:00
out . println ( probDists ) ;
2009-04-22 06:27:26 +08:00
}
2009-03-16 06:42:24 +08:00
return 1 ;
}
// Given result of map function
public Integer reduceInit ( ) { return 0 ; }
public Integer reduce ( Integer value , Integer sum ) {
2009-05-07 07:26:21 +08:00
return treeReduce ( sum , value ) ;
2009-05-02 05:40:46 +08:00
}
public Integer treeReduce ( Integer lhs , Integer rhs ) {
return lhs + rhs ;
2009-03-16 06:42:24 +08:00
}
}