From 3adb4239e453543d9bd189cfc6a407779180328f Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 10 Jun 2009 08:19:31 +0000 Subject: [PATCH] Same as regular Pileup, but also allows you to see flanking region around locus. This will be useful in determining that some SNPs are spurious due to being at the ends of homopolymer regions. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@959 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/PileupWithContextWalker.java | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/PileupWithContextWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWithContextWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWithContextWalker.java new file mode 100755 index 000000000..092f15f4e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWithContextWalker.java @@ -0,0 +1,146 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import net.sf.samtools.SAMRecord; +import net.sf.picard.reference.ReferenceSequence; + +import java.util.List; +import java.io.File; +import java.io.IOException; + +/** + * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] + * + * Print the alignment in the pileup format. In the pileup format, each line represents a genomic position, + * consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping + * qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all + * encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand, + * a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the + * reverse strand. + * + * A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next + * reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence. + * Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. + * Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read + * separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality. + * A symbol '$' marks the end of a read segment. + */ +public class PileupWithContextWalker extends LocusWalker implements TreeReducible { + @Argument(fullName="alwaysShowSecondBase",doc="If true, prints dummy bases for the second bases in the BAM file where they are missing",required=false) + public boolean alwaysShowSecondBase = false; + + //@Argument(fullName="showSecondBaseQuals",doc="If true, prints out second base qualities in the pileup",required=false) + //public boolean showSecondBaseQuals = false; + + @Argument(fullName="qualsAsInts",doc="If true, prints out qualities in the pileup as comma-separated integers",required=false) + public boolean qualsAsInts = false; + + @Argument(fullName="extended",shortName="ext",doc="extended",required=false) + public boolean EXTENDED = false; + + @Argument(fullName="flagUncoveredBases",shortName="fub",doc="Flag bases with zero coverage",required=false) + public boolean FLAG_UNCOVERED_BASES = true; + + @Argument(fullName="contextBases",shortName="cb",doc="How much context around the locus should we show?",required=false) + public int contextBases = 0; + + public IndexedFastaSequenceFile refSeq; + public ReferenceSequence contigSeq = null; + public String contig = null; + + public void initialize() { + File refFile = this.getToolkit().getArguments().referenceFile; + + try { + refSeq = new IndexedFastaSequenceFile(refFile); + } catch (IOException e) { + refSeq = null; + } + } + + public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { + ReadBackedPileup pileup = new ReadBackedPileup(ref, context); + String bases = pileup.getBases(); + + if ( bases.equals("") && FLAG_UNCOVERED_BASES ) { + bases = "***UNCOVERED_SITE***"; + } + + StringBuilder extras = new StringBuilder(); + + String secondBasePileup = pileup.getSecondaryBasePileup(); + if ( secondBasePileup == null && alwaysShowSecondBase ) { + secondBasePileup = Utils.dupString('N', bases.length()); + } + if ( secondBasePileup != null ) extras.append(" ").append(secondBasePileup); + + /* + if ( showSecondBaseQuals ) { + String secondQualPileup = pileup.getSecondaryQualPileup(); + if ( secondQualPileup == null ) + secondQualPileup = Utils.dupString((char)(33), bases.length()); + extras.append(" ").append(secondQualPileup); + } + */ + if (contig == null || !context.getContig().equals(contig)) { + contig = context.getContig(); + contigSeq = refSeq.getSequence(contig); + } + + if (contextBases > 0 && refSeq != null) { + long startPos = context.getPosition() - contextBases <= 0 ? 1 : context.getPosition() - contextBases; + long stopPos = context.getPosition() + contextBases > contigSeq.length() ? contigSeq.length() : context.getPosition() + contextBases; + + ReferenceSequence prevRefSequence = refSeq.getSubsequenceAt(context.getContig(), startPos, context.getPosition()); + ReferenceSequence nextRefSequence = refSeq.getSubsequenceAt(context.getContig(), context.getPosition(), stopPos); + + extras.append(" prev=").append(new String(prevRefSequence.getBases())); + extras.append(" next=").append(new String(nextRefSequence.getBases())); + } + + String rodString = ""; + for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) { + if ( datum != null && ! (datum instanceof rodDbSNP)) { + //System.out.printf("rod = %s%n", datum.toSimpleString()); + rodString += datum.toSimpleString(); + //System.out.printf("Rod string %s%n", rodString); + } + } + + rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null); + if ( dbsnp != null ) + rodString += dbsnp.toMediumString(); + + if ( rodString != "" ) + rodString = "[ROD: " + rodString + "]"; + + //if ( context.getLocation().getStart() % 1 == 0 ) { + //System.out.printf("quals as ints %b%n", qualsAsInts); + out.printf("%s%s %s%n", pileup.getPileupString(qualsAsInts), extras, rodString); + //} + + if ( EXTENDED ) { + String probDists = pileup.getProbDistPileup(); + out.println(probDists); + } + + return 1; + } + + // Given result of map function + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { + return treeReduce(sum,value); + } + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } +} \ No newline at end of file