diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index 26ff4db24..311d66d81 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -53,13 +53,14 @@ public class ErrorModel { PairHMMIndelErrorModel pairModel = null; LinkedHashMap haplotypeMap = null; - HashMap> indelLikelihoodMap = null; double[][] perReadLikelihoods = null; double[] model = new double[maxQualityScore+1]; Arrays.fill(model,Double.NEGATIVE_INFINITY); boolean hasCalledAlleles = false; + + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); if (refSampleVC != null) { for (Allele allele : refSampleVC.getAlleles()) { @@ -72,7 +73,6 @@ public class ErrorModel { if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - indelLikelihoodMap = new HashMap>(); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } } @@ -92,12 +92,12 @@ public class ErrorModel { Allele refAllele = refSampleVC.getReference(); - if (refSampleVC.isIndel()) { + if ( refSampleVC.isIndel()) { final int readCounts[] = new int[refSamplePileup.getNumberOfElements()]; //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); if (!haplotypeMap.isEmpty()) - perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts); + 
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); } int idx = 0; for (PileupElement refPileupElement : refSamplePileup) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index f6ce818be..4c20700ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -41,15 +41,6 @@ import java.util.*; public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - //protected Set laneIDs; - public enum Model { - SNP, - INDEL, - POOLSNP, - POOLINDEL, - BOTH - } - final protected UnifiedArgumentCollection UAC; protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { @@ -203,7 +194,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { HashMap perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts); if (perLaneErrorModels == null && UAC.referenceSampleName != null) @@ -215,8 +207,11 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G newContext.put(DUMMY_SAMPLE_NAME,mergedContext); contexts = newContext; } - - // get initial alleles to genotype + if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { + // starting a new site: clear allele list + 
perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods + } + // get initial alleles to genotype final List allAlleles = new ArrayList(); if (allAllelesToUse == null || allAllelesToUse.isEmpty()) allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse)); @@ -234,9 +229,13 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G continue; ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ + // no likelihoods have been computed for this sample at this site + perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap()); + } // create the GenotypeLikelihoods object - final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO); + final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO, perReadAlleleLikelihoodMap.get(sample.getKey())); // actually compute likelihoods final int nGoodBases = GL.add(pileup, UAC); if ( nGoodBases > 0 ) @@ -333,7 +332,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, - final boolean ignoreLaneInformation); + final boolean ignoreLaneInformation, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap); protected abstract List getInitialAllelesToUse(final RefMetaDataTracker tracker, final ReferenceContext ref, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index 34267b9a8..ac212cfb5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -26,6 +26,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype double[][] readHaplotypeLikelihoods; final byte refBase; + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap; public GeneralPloidyIndelGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, @@ -34,7 +35,8 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype final boolean ignoreLaneInformation, final PairHMMIndelErrorModel pairModel, final LinkedHashMap haplotypeMap, - final ReferenceContext referenceContext) { + final ReferenceContext referenceContext, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); this.pairModel = pairModel; this.haplotypeMap = haplotypeMap; @@ -42,6 +44,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles); // todo - not needed if indel alleles have base at current position this.refBase = referenceContext.getBase(); + this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap; } // ------------------------------------------------------------------------------------- @@ -142,8 +145,9 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype List numSeenBases = new ArrayList(this.alleles.size()); if (!hasReferenceSampleData) { + final int readCounts[] = new int[pileup.getNumberOfElements()]; - readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, 
haplotypeMap, refContext, eventLength, IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts); + readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); n = readHaplotypeLikelihoods.length; } else { Allele refAllele = null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index f6559f666..fc0c526bc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -73,8 +73,9 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, - final boolean ignoreLaneInformation){ - return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref); + final boolean ignoreLaneInformation, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ + return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref, perReadAlleleLikelihoodMap); } protected List getInitialAllelesToUse(final RefMetaDataTracker tracker, @@ -90,7 +91,6 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE) alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE); if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { - 
IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear(); haplotypeMap.clear(); } IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java index 30d614455..4376ec601 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java @@ -49,7 +49,8 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, - final boolean ignoreLaneInformation) { + final boolean ignoreLaneInformation, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java index 0890ac20c..287acafb3 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java @@ -2,6 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.jgrapht.graph.DefaultDirectedGraph; +import java.io.Serializable; +import java.util.Comparator; + /** * Created by IntelliJ IDEA. 
* User: ebanks @@ -9,7 +12,7 @@ import org.jgrapht.graph.DefaultDirectedGraph; */ // simple edge class for connecting nodes in the graph -public class DeBruijnEdge implements Comparable { +public class DeBruijnEdge { private int multiplicity; private boolean isRef; @@ -53,8 +56,10 @@ public class DeBruijnEdge implements Comparable { return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); } - @Override - public int compareTo( final DeBruijnEdge that ) { - return this.multiplicity - that.multiplicity; + public static class EdgeWeightComparator implements Comparator, Serializable { + @Override + public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) { + return edge1.multiplicity - edge2.multiplicity; + } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index 39833613d..4da3251bc 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -14,7 +14,7 @@ public class DeBruijnVertex { public final int kmer; public DeBruijnVertex( final byte[] sequence, final int kmer ) { - this.sequence = sequence; + this.sequence = sequence.clone(); this.kmer = kmer; } @@ -37,7 +37,7 @@ public class DeBruijnVertex { } public byte[] getSequence() { - return sequence; + return sequence.clone(); } public byte[] getSuffix() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 6afdc58ea..9de9b3292 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -42,22 +42,24 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final int MNP_LOOK_AHEAD; private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final int MNP_LOOK_AHEAD, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { this.DEBUG = DEBUG; - this.MNP_LOOK_AHEAD = MNP_LOOK_AHEAD; this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } // This function is the streamlined approach, currently not being used @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, final byte[] ref, final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) { + public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, + final ArrayList haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser ) { // Prepare the list of haplotype indices to genotype final ArrayList allelesToGenotype = new ArrayList(); @@ -116,7 +118,7 @@ public class GenotypingEngine { System.out.println( "> Cigar = " + h.getCigar() ); } // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) ); + 
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); startPosKeySet.addAll(h.getEventMap().keySet()); } @@ -199,7 +201,7 @@ public class GenotypingEngine { if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) ); + h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { System.out.println( h.toString() ); @@ -224,7 +226,6 @@ public class GenotypingEngine { } } - // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { @@ -518,11 +519,7 @@ public class GenotypingEngine { return false; } - protected static HashMap generateVCsFromAlignment( final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) { - return generateVCsFromAlignment(null, alignmentStartHapwrtRef, cigar, ref, alignment, refLoc, sourceNameToAdd, MNP_LOOK_AHEAD); // BUGBUG: needed for compatibility with HaplotypeResolver code - } - - protected static HashMap generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) { + protected static HashMap generateVCsFromAlignment( final Haplotype haplotype, final int 
alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { final HashMap vcs = new HashMap(); int refPos = alignmentStartHapwrtRef; @@ -533,24 +530,36 @@ public class GenotypingEngine { final int elementLength = ce.getLength(); switch( ce.getOperator() ) { case I: + { final ArrayList insertionAlleles = new ArrayList(); final int insertionStart = refLoc.getStart() + refPos - 1; - insertionAlleles.add( Allele.create(ref[refPos-1], true) ); - if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) { + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) ) { + insertionAlleles.add( Allele.create(refByte, true) ); + } + if( (haplotype.leftBreakPoint != 0 || haplotype.rightBreakPoint != 0) && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) { insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); } else { byte[] insertionBases = new byte[]{}; insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength )); - insertionAlleles.add( Allele.create(insertionBases, false) ); + if( BaseUtils.isAllRegularBases(insertionBases) ) { + insertionAlleles.add( Allele.create(insertionBases, false) ); + } + } + if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele + vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); } - 
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); alignmentPos += elementLength; break; + } case S: + { alignmentPos += elementLength; break; + } case D: + { final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base final ArrayList deletionAlleles = new ArrayList(); final int deletionStart = refLoc.getStart() + refPos - 1; @@ -561,51 +570,36 @@ public class GenotypingEngine { // deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); // vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make()); //} else { + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { deletionAlleles.add( Allele.create(deletionBases, true) ); - deletionAlleles.add( Allele.create(ref[refPos-1], false) ); + deletionAlleles.add( Allele.create(refByte, false) ); vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + } //} refPos += elementLength; break; + } case M: case EQ: case X: - int numSinceMismatch = -1; - int stopOfMismatch = -1; - int startOfMismatch = -1; - int refPosStartOfMismatch = -1; + { for( int iii = 0; iii < elementLength; iii++ ) { - if( ref[refPos] != alignment[alignmentPos] && alignment[alignmentPos] != ((byte) 'N') ) { - // SNP or start of possible MNP - if( stopOfMismatch == -1 ) { - startOfMismatch = alignmentPos; - stopOfMismatch = alignmentPos; - numSinceMismatch = 0; - refPosStartOfMismatch = refPos; - } else { - stopOfMismatch = alignmentPos; + final byte refByte = ref[refPos]; + final byte altByte = alignment[alignmentPos]; + if( refByte != altByte ) { // SNP! 
+ if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { + final ArrayList snpAlleles = new ArrayList(); + snpAlleles.add( Allele.create( refByte, true ) ); + snpAlleles.add( Allele.create( altByte, false ) ); + vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); } } - if( stopOfMismatch != -1) { - numSinceMismatch++; - } - if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) { - final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 ); - final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 ); - final ArrayList snpAlleles = new ArrayList(); - snpAlleles.add( Allele.create( refBases, true ) ); - snpAlleles.add( Allele.create( mismatchBases, false ) ); - final int snpStart = refLoc.getStart() + refPosStartOfMismatch; - vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make()); - numSinceMismatch = -1; - stopOfMismatch = -1; - startOfMismatch = -1; - refPosStartOfMismatch = -1; - } refPos++; alignmentPos++; } break; + } case N: case H: case P: diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ec4fb3950..acb5c9ebe 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; +import 
org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.commandline.*; @@ -44,10 +46,6 @@ import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -121,10 +119,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; - @Hidden - @Argument(fullName="mnpLookAhead", shortName="mnpLookAhead", doc = "The number of bases to combine together to form MNPs out of nearby consecutive SNPs on the same haplotype", required = false) - protected int MNP_LOOK_AHEAD = 0; - @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. 
Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; @@ -137,7 +131,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; @Advanced - @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Gap continuation penalty for use in the Pair HMM", required = false) + @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) @@ -180,7 +174,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * so annotations will be excluded even if they are explicitly included with the other options. */ @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"})); + protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. 
@@ -189,23 +183,23 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected String[] annotationClassesToUse = { "Standard" }; @ArgumentCollection - private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; - @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + // the UG engines + private UnifiedGenotyperEngine UG_engine = null; + private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; + // the assembly engine - LocalAssemblyEngine assemblyEngine = null; + private LocalAssemblyEngine assemblyEngine = null; // the likelihoods engine - LikelihoodCalculationEngine likelihoodCalculationEngine = null; + private LikelihoodCalculationEngine likelihoodCalculationEngine = null; // the genotyping engine - GenotypingEngine genotypingEngine = null; + private GenotypingEngine genotypingEngine = null; // the annotation engine private VariantAnnotatorEngine annotationEngine; @@ -240,7 +234,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model - UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; // the GLmodel isn't used by the HaplotypeCaller but it is dangerous to let the user change this argument + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, 
null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling @@ -291,7 +285,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); - genotypingEngine = new GenotypingEngine( DEBUG, MNP_LOOK_AHEAD, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); } //--------------------------------------------------------------------------------------------------------------- @@ -413,45 +407,48 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for( final Pair>> callResult : ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() ) + ? 
genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() ) : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } - final Map>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); + final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); - - // add some custom annotations to the calls final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } + + if( !GENOTYPE_FULL_ACTIVE_REGION ) { + // add some custom annotations to the calls + + // Calculate the number of variants on the haplotype + int maxNumVar = 0; + for( final Allele allele : callResult.getFirst().getAlleles() ) { + if( !allele.isReference() ) { + for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { + final int numVar = haplotype.getEventMap().size(); + if( numVar > maxNumVar ) { maxNumVar = numVar; } + } } } - } - // Calculate the event 
length - int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } + // Calculate the event length + int maxLength = 0; + for ( final Allele a : annotatedCall.getAlternateAlleles() ) { + final int length = a.length() - annotatedCall.getReference().length(); + if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } + } - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); + myAttributes.put("NVH", maxNumVar); + myAttributes.put("NumHapEval", bestHaplotypes.size()); + myAttributes.put("NumHapAssembly", haplotypes.size()); + myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); + myAttributes.put("EVENTLENGTH", maxLength); + myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? 
"SNP" : "INDEL") ); + myAttributes.put("extType", annotatedCall.getType().toString() ); - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); + //if( likelihoodCalculationEngine.haplotypeScore != null ) { + // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); + //} + if( annotatedCall.hasAttribute("QD") ) { + myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); + } } vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index 38f11cc5d..8a401439b 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; @@ -337,8 +338,8 @@ public class HaplotypeResolver extends RodWalker { } // order results by start position - final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(0, 
swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1, 0)); - final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2, 0)); + final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1)); + final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2)); if ( source1Map.size() == 0 || source2Map.size() == 0 ) { // TODO -- handle errors appropriately logger.debug("No source alleles; aborting at " + refContext.getLocus()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index 0ef1a13a4..f7575439b 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -4,6 +4,7 @@ import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.jgrapht.graph.DefaultDirectedGraph; +import java.io.Serializable; import java.util.*; /** @@ -76,13 +77,15 @@ public class KBestPaths { } } - protected static class PathComparatorTotalScore implements Comparator { + protected static class PathComparatorTotalScore implements Comparator, Serializable { + @Override public int compare(final Path path1, final Path path2) { return path1.totalScore - path2.totalScore; } } - //protected static class PathComparatorLowestEdge implements Comparator { + //protected static class 
PathComparatorLowestEdge implements Comparator, Serializable { + // @Override // public int compare(final Path path1, final Path path2) { // return path2.lowestEdge - path1.lowestEdge; // } @@ -124,7 +127,7 @@ public class KBestPaths { // recursively run DFS final ArrayList edgeArrayList = new ArrayList(); edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex)); - Collections.sort(edgeArrayList); + Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator()); Collections.reverse(edgeArrayList); for ( final DeBruijnEdge edge : edgeArrayList ) { // make sure the edge is not already in the path diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index fabf5633f..d04c1a9e2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -77,10 +78,10 @@ public class LikelihoodCalculationEngine { PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); // for each sample's reads - for( final String sample : perSampleReadList.keySet() ) { + for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given 
those haplotypes - computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray ); + computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray ); } } @@ -179,7 +180,6 @@ public class LikelihoodCalculationEngine { final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // log10(10^(a*x1) + 10^(b*x2)) ??? // First term is approximated by Jacobian log with table lookup. haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); } @@ -323,11 +323,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } - public static Map>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) { - final Map>> returnMap = new HashMap>>(); + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) { + final Map returnMap = new HashMap(); final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - final Map> alleleReadMap = new HashMap>(); + //final Map> alleleReadMap = new HashMap>(); + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + final ArrayList readsForThisSample = sample.getValue(); for( int iii = 0; iii < readsForThisSample.size(); iii++ ) { final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same! 
@@ -335,51 +337,31 @@ public class LikelihoodCalculationEngine { if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { final double likelihoods[] = new double[call.getFirst().getAlleles().size()]; int count = 0; - for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood - double maxLikelihood = Double.NEGATIVE_INFINITY; + + for( final Allele a : call.getFirst().getAlleles() ) { for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object) final double likelihood = h.getReadLikelihoods(sample.getKey())[iii]; - if( likelihood > maxLikelihood ) { - maxLikelihood = likelihood; - } - } - likelihoods[count++] = maxLikelihood; - } - final int bestAllele = MathUtils.maxElementIndex(likelihoods); - final double bestLikelihood = likelihoods[bestAllele]; - Allele allele = Allele.NO_CALL; - boolean isInformativeRead = false; - for( final double likelihood : likelihoods ) { - if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) { - isInformativeRead = true; - break; + likelihoodMap.add(read, a, likelihood); } } - // uninformative reads get the no call Allele - if( isInformativeRead ) { - allele = call.getFirst().getAlleles().get(bestAllele); - } - List readList = alleleReadMap.get(allele); - if( readList == null ) { - readList = new ArrayList(); - alleleReadMap.put(allele, readList); - } - readList.add(read); } } - // add all filtered reads to the NO_CALL list because they weren't given any likelihoods +/* // add all filtered reads to the NO_CALL list because they weren't given any likelihoods List readList = alleleReadMap.get(Allele.NO_CALL); if( readList == null ) { readList = new ArrayList(); alleleReadMap.put(Allele.NO_CALL, readList); } - for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { + */ + /* for( final GATKSAMRecord read : 
perSampleFilteredReadList.get(sample.getKey()) ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { readList.add(read); } } - returnMap.put(sample.getKey(), alleleReadMap); + */ + returnMap.put(sample.getKey(), likelihoodMap); + } return returnMap; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 56cb6c3d4..a7a1630d4 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -201,7 +201,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // compute mean number of reduced read counts in current kmer span final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1); // precise rounding can make a difference with low consensus counts - countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length); + countNumber = MathUtils.arrayMax(counts); + // countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length); } if( !badKmer ) { @@ -292,7 +293,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present - final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared 
place + final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 6ae34f190..b5b0abc6e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -18,8 +18,8 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; final String REFSAMPLE_NAME = "NA12878"; - final String MTINTERVALS = "MT:1-3000"; - final String LSVINTERVALS = "20:40,000,000-41,000,000"; + final String MTINTERVALS = "MT:1-1000"; + final String LSVINTERVALS = "20:40,500,000-41,000,000"; final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; @@ -47,31 +47,31 @@ public class 
UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0934f72865388999efec64bd9d4a9b93"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","077db83cf7dc5490f670c85856b408b2"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","126581c72d287722437274d41b6fed7b"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","e460a17377b731ff4eab36fb56042ecd"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","b543aa1c3efedb301e525c1d6c50ed8d"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9514ed15c7030b6d47e04e6a3a2b0a3e"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","55b20557a836bb92688e68f12d7f5dc4"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","26598044436c8044f22ffa767b06a0f0"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","7eb889e8e07182f4c3d64609591f9459"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da359fe7dd6dce045193198c264301ee"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { 
- PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "db8114877b99b14f7180fdcd24b040a7"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "ad0eef3a9deaa098d79df62af7e5448a"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 539190fe9..07e7b0d92 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -142,7 +142,6 @@ public class GenotypingEngineUnitTest extends BaseTest { byte[] ref; byte[] hap; HashMap expected; - GenotypingEngine ge = new GenotypingEngine(false, 0, false); public BasicGenotypingTestProvider(String refString, String hapString, HashMap expected) { super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString)); @@ -153,7 +152,7 @@ public class GenotypingEngineUnitTest extends BaseTest { public HashMap calcAlignment() { final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); - return ge.generateVCsFromAlignment( alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name", 0); + return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c766f363c..2ae1f2ca5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -8,9 +8,10 @@ import java.util.Arrays; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; - //final String RECAL_FILE = validationDataLocation + "NA12878.kmer.8.subset.recal_data.bqsr"; private void HCTest(String bam, String args, String md5) { final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; @@ -20,28 +21,49 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "6b30c7e1b6bbe80d180d9d67441cec12"); + HCTest(CEUTRIO_BAM, "", "e5b4a0627a1d69b9356f8a7cd2260e89"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "4cdfbfeadef00725974828310558d7d4"); + HCTest(NA12878_BAM, "", "202d5b6edaf74f411c170099749f202f"); } @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", 
"6183fb6e374976d7087150009685e043"); + HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "561931ba3919808ec471e745cb3148c7"); } private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 3"; + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); } @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "ab7593a7a60a2e9a66053572f1718df1"); + HCTestComplexVariants(CEUTRIO_BAM, "", "3424b398a9f47c8ac606a5c56eb7d8a7"); + } + + private void HCTestSymbolicVariants(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerSingleSampleSymbolic() { + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b71cfaea9390136c584c9671b149d573"); + } + + private void HCTestIndelQualityScores(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); 
+ executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerSingleSampleIndelQualityScores() { + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e1f88fac91424740c0eaac1de48b3970"); } } - diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index bbbd96cf1..f66e229bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -64,12 +64,35 @@ public class GATKArgumentCollection { @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; + // -------------------------------------------------------------------------------------------------------------- + // + // GATKRunReport options + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD; @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) public File gatkKeyFile = null; + /** + * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be + * used to group together runs during later analysis. 
One use of this capability is to tag runs as GATK + * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * + * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find + * meaningful. + */ + @Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false) + public String tag = "NA"; + + // -------------------------------------------------------------------------------------------------------------- + // + // XXX + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java new file mode 100644 index 000000000..f30fc0316 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.gatk.arguments; + +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 8/20/12 + * A collection of arguments that are common to the various callers. 
+ * This is pulled out so that every caller isn't exposed to the arguments from every other caller. + */ + +public class StandardCallerArgumentCollection { + /** + * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are: + * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 + */ + @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) + public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) + public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) + public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + + /** + * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with + * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this + * is the default). + */ + @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) + public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + + /** + * This argument allows you to emit low quality calls as filtered records. 
+ */ + @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) + public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + + /** + * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding + */ + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) + public RodBinding alleles; + + /** + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it + * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend + * that you not play around with this parameter. 
+ */ + @Advanced + @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) + public int MAX_ALTERNATE_ALLELES = 3; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index f97069189..75af7976f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -301,14 +301,15 @@ public class LocusIteratorByState extends LocusIterator { final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final boolean isSingleElementCigar = nextElement == lastElement; final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator - final int readOffset = state.getReadOffset(); // the base offset on this read + int readOffset = state.getReadOffset(); // the base offset on this read final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; - final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); int nextElementLength = nextElement.getLength(); @@ -328,8 +329,13 @@ public class LocusIteratorByState extends LocusIterator { else { if (!filterBaseInRead(read, 
location.getStart())) { String insertedBaseString = null; - if (nextOp == CigarOperator.I) - insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength())); + if (nextOp == CigarOperator.I) { + final int insertionOffset = isSingleElementCigar ? 0 : 1; + // TODO -- someone please implement a better fix for the single element insertion CIGAR! + if (isSingleElementCigar) + readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); + } pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); size++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index b60a7845a..035252c14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -138,6 +138,9 @@ public class GATKRunReport { @Element(required = true, name = "iterations") private long nIterations; + @Element(required = true, name = "tag") + private String tag; + public enum PhoneHomeOption { /** Disable phone home */ NO_ET, @@ -186,6 +189,8 @@ public class GATKRunReport { nIterations = engine.getCumulativeMetrics().getNumIterations(); } + tag = engine.getArguments().tag; + // user and hostname -- information about the runner of the GATK userName = System.getProperty("user.name"); hostName = Utils.resolveHostname(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 
979e0f2d6..67de427e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -31,7 +32,7 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); @Override @@ -110,18 +111,18 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); // add active regions to queue of regions to process // first check if can merge active regions over shard boundaries if( !activeRegions.isEmpty() ) { if( !workQueue.isEmpty() ) { - final org.broadinstitute.sting.utils.activeregion.ActiveRegion last = workQueue.getLast(); - final org.broadinstitute.sting.utils.activeregion.ActiveRegion first = activeRegions.get(0); + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { workQueue.removeLast(); activeRegions.remove(first); - workQueue.add( new 
org.broadinstitute.sting.utils.activeregion.ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); } } workQueue.addAll( activeRegions ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 30f81b20c..a68f0df21 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -51,7 +52,12 @@ public class AlleleBalance extends InfoFieldAnnotation { char[] BASES = {'A','C','G','T'}; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 11c9c3a99..0104f24d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -24,7 +25,14 @@ import java.util.List; */ public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { - public void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, final GenotypeBuilder gb) { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java index c3b6de65a..3cbca4f52 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -52,7 +53,12 @@ import java.util.Map; */ public class BaseCounts extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index bd884892c..dc727fa48 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import 
org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -21,66 +23,37 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for ( final PileupElement p : pileup ) { - if( isUsableBase(p) ) { - if ( p.getBase() == ref ) - refQuals.add((double)p.getQual()); - else if ( alts.contains(p.getBase()) ) - altQuals.add((double)p.getQual()); - } - } - } - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - // TODO -- implement me; how do we pull out the correct offset from the read? 
- return; - -/* - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { + protected void fillQualsFromPileup(final List allAlleles, final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, final List altQuals){ + if (alleleLikelihoodMap == null) { + // use fast SNP-based version if we don't have per-read allele likelihoods + for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { - if ( matchesRef ) + if ( allAlleles.get(0).equals(Allele.create(p.getBase(),true)) ) { refQuals.add((double)p.getQual()); - else + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { altQuals.add((double)p.getQual()); - } - } - } -*/ - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
- HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p)) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry entry : el.entrySet()) { - - if (entry.getKey().isReference()) - refLikelihood = entry.getValue(); - else { - double like = entry.getValue(); - if (like >= altLikelihood) - altLikelihood = like; } } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add(-10.0*refLikelihood); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add(-10.0*altLikelihood); } + return; + } + + for (Map el : alleleLikelihoodMap.getLikelihoodMapValues()) { + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add(-10.0*(double)el.get(a)); + else if (allAlleles.contains(a)) + altQuals.add(-10.0*(double)el.get(a)); + + } } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 54837baad..4ae1a0bba 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -61,7 +62,12 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn private Set founderIds = new HashSet(); - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { if ( ! vc.hasGenotypes() ) return null; @@ -73,13 +79,6 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn founderIds = ((Walker)walker).getSampleDB().getFounderIds(); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( ! 
vc.hasGenotypes() ) - return null; - - return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); - } - public List getKeyNames() { return Arrays.asList(keyNames); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index f41a40621..1fd220f2f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -1,10 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -24,68 +22,26 @@ public class ClippingRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. 
Ref number of hard clipped bases")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - return; - // This working implementation below needs to be tested for the UG pipeline - /* - for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { - if ( p.getBase() == ref ) { - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } else if ( alts.contains(p.getBase()) ) { - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } - } - } - */ - } - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap likelihoodMap, final List refQuals, final List altQuals) { + // todo - only support non-pileup case for now, e.g. 
active-region based version + if (pileup != null || likelihoodMap == null) + return; + + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); + else if (allAlleles.contains(a)) + altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - if ( matchesRef ) - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); - else - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); - } } } - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - return; - // This working implementation below needs to be tested for the UG pipeline - - /* - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
- HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Allele a : el.keySet()) { - - if (a.isReference()) - refLikelihood =el.get(a); - else { - double like = el.get(a); - if (like >= altLikelihood) - altLikelihood = like; - } - } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } - } - */ - } -} + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 28ca77f18..ec3f1e5c7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import 
org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; @@ -22,44 +23,37 @@ import java.util.Map; /** * Total (unfiltered) depth over all samples. * - * This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample - * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal - * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site. - * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the - * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the - * power I have to determine the genotype of the sample at this site, while the AD tells me how many times - * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering - * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like - * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would - * normally be excluded from the statistical calculations going into GQ and QUAL. - * - * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with - * -dcov D is N * D + * While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's + * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth + * over all samples. 
Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for + * N samples with -dcov D is N * D */ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { int depth = 0; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) - depth += sample.getValue().getBasePileup().depthOfCoverage(); - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%d", depth)); - return map; - } + if (stratifiedContexts != null) { + if ( stratifiedContexts.size() == 0 ) + return null; - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - int depth = 0; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final List alleleBin : alleleBins.values() ) { - depth += alleleBin.size(); - } + for ( Map.Entry sample : stratifiedContexts.entrySet() ) + depth += sample.getValue().getBasePileup().depthOfCoverage(); } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; + + for ( Map.Entry sample : perReadAlleleLikelihoodMap.entrySet() ) + depth += sample.getValue().getNumberOfStoredElements(); + } + else + return null; Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%d", depth)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 
a9edab752..320fe3148 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -6,11 +6,13 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -19,40 +21,49 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; /** * The depth of coverage of each VCF allele in this sample. * - * This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample - * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal - * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site. 
- * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the + * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this + * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the + * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of + * REF and ALT fields) is the unfiltered count of all reads that carried with them the * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the * power I have to determine the genotype of the sample at this site, while the AD tells me how many times * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that - * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that - * are actually present and correctly left-aligned in the alignments themselves). Because of this fact and - * because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base - * assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what - * determine the genotype calls (see below). + * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. + * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are + * many non-informatice reads. 
+ * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation, + * one should not base assumptions about the underlying genotype based on it; + * instead, the genotype likelihoods (PLs) are what determine the genotype calls. */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { - public void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, GenotypeBuilder gb) { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { if ( g == null || !g.isCalled() ) return; - if ( vc.isSNP() ) - annotateSNP(stratifiedContext, vc, gb); - else if ( vc.isIndel() ) - annotateIndel(stratifiedContext, ref.getBase(), vc, gb); + if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) + annotateWithLikelihoods(alleleLikelihoodMap, vc, gb); + else if ( stratifiedContext != null && (vc.isSNP())) + annotateWithPileup(stratifiedContext, vc, gb); } - private void annotateSNP(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { + private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { HashMap alleleCounts = new HashMap(); for ( Allele allele : vc.getAlleles() ) @@ -73,48 +84,29 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa gb.AD(counts); } - private void annotateIndel(final AlignmentContext stratifiedContext, final byte refBase, final VariantContext vc, final GenotypeBuilder gb) { - ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - if ( pileup == null ) - 
return; - + private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) { final HashMap alleleCounts = new HashMap(); - final Allele refAllele = vc.getReference(); for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } + for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (!vc.getAlleles().contains(a)) + continue; // sanity check - shouldn't be needed + alleleCounts.put(a,alleleCounts.get(a)+1); - for ( PileupElement p : pileup ) { - if ( p.isBeforeInsertion() ) { - - final Allele insertion = Allele.create((char)refBase + p.getEventBases(), false); - if ( alleleCounts.containsKey(insertion) ) { - alleleCounts.put(insertion, alleleCounts.get(insertion)+1); - } - - } else if ( p.isBeforeDeletionStart() ) { - if ( p.getEventLength() == refAllele.length() - 1 ) { - // this is indeed the deletion allele recorded in VC - final Allele deletion = Allele.create(refBase); - if ( alleleCounts.containsKey(deletion) ) { - alleleCounts.put(deletion, alleleCounts.get(deletion)+1); - } - } - } else if ( p.getRead().getAlignmentEnd() > vc.getStart() ) { - alleleCounts.put(refAllele, alleleCounts.get(refAllele)+1); - } } - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(refAllele); + counts[0] = alleleCounts.get(vc.getReference()); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); gb.AD(counts); } - // public String getIndelBases() public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); } public List getDescriptions() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 131670599..ad0ad50b0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -54,21 +55,30 @@ import java.util.*; public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; - - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isVariant() ) return null; int[][] table; - if ( vc.isSNP() ) + if (vc.isSNP() && stratifiedContexts != null) { table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); - else if ( vc.isIndel() || vc.isMixed() ) { - table = getIndelContingencyTable(stratifiedContexts); - if (table == null) - return null; + } + else if (stratifiedPerReadAlleleLikelihoodMap != null) { + // 
either SNP with no alignment context, or indels: per-read likelihood map needed + table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); } else + // for non-snp variants, we need per-read likelihoods. + // for snps, we can get same result from simple pileup + return null; + + if (table == null) return null; Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); @@ -80,22 +90,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( !vc.isVariant() ) - return null; - - final int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); - - final Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); - if ( pvalue == null ) - return null; - - final Map map = new HashMap(); - map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); - return map; - - } - public List getKeyNames() { return Arrays.asList(FS); } @@ -161,7 +155,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat table[0][1] += 1; table[1][1] -= 1; - return (table[0][0] >= 0 && table[1][1] >= 0) ? true : false; + return (table[0][0] >= 0 && table[1][1] >= 0); } private static boolean unrotateTable(int[][] table) { @@ -171,7 +165,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat table[0][1] -= 1; table[1][1] += 1; - return (table[0][1] >= 0 && table[1][0] >= 0) ? 
true : false; + return (table[0][1] >= 0 && table[1][0] >= 0); } private static double computePValue(int[][] table) { @@ -218,31 +212,31 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable(Map>> stratifiedContexts, Allele ref, Allele alt) { + private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, + final Allele ref, final Allele alt) { int[][] table = new int[2][2]; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + if ( el.getKey().isReducedRead() ) // ignore reduced reads + continue; + final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); + final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alt.equals(alleleBin.getKey()); if ( !matchesRef && !matchesAlt ) continue; - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - boolean isFW = read.getReadNegativeStrandFlag(); + boolean isFW = el.getKey().getReadNegativeStrandFlag(); - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; - table[row][column]++; - } + table[row][column]++; } } return table; } - /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc @@ -275,69 +269,5 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return table; } - /** - Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: - * fw rc - * allele1 # # - * allele2 # # - * @return a 2x2 contingency table - */ - private static int[][] getIndelContingencyTable(Map stratifiedContexts) { - final double INDEL_LIKELIHOOD_THRESH = 0.3; - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - if (indelLikelihoodMap == null) - return null; - - int[][] table = new int[2][2]; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - final AlignmentContext context = sample.getValue(); - if ( context == null ) - continue; - - final ReadBackedPileup pileup = context.getBasePileup(); - for ( final PileupElement p : pileup ) { - if ( ! RankSumTest.isUsableBase(p, true) || p.getRead().isReducedRead() ) // ignore reduced reads - continue; - if ( indelLikelihoodMap.containsKey(p) ) { - // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. - // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). - // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. 
- // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref" - // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry entry : el.entrySet()) { - - if (entry.getKey().isReference()) - refLikelihood = entry.getValue(); - else { - double like = entry.getValue(); - if (like >= altLikelihood) - altLikelihood = like; - } - } - - boolean matchesRef = (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)); - boolean matchesAlt = (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)); - if ( matchesRef || matchesAlt ) { - int row = matchesRef ? 0 : 1; - int column = isFW ? 
0 : 1; - - table[row][column]++; - } - - - } - } - } - - return table; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index fba30b3f7..3fe5c5837 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -25,7 +26,12 @@ import java.util.Map; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { double content = computeGCContent(ref); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", content)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index c6d8883c5..6487eac50 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -28,10 +28,12 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -47,6 +49,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; /** @@ -54,17 +57,31 @@ import java.util.*; * are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls. * Note that the Haplotype Score is only calculated for sites with read coverage. 
*/ -public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation { +public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static boolean DEBUG = false; private final static int MIN_CONTEXT_WING_SIZE = 10; private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; private final static char REGEXP_WILDCARD = '.'; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + if (vc.isSNP() && stratifiedContexts != null) + return annotatePileup(ref, stratifiedContexts, vc); + else if (stratifiedPerReadAlleleLikelihoodMap != null && vc.isVariant()) + return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); + else return null; + } - if (!vc.isSNP() && !vc.isIndel() && !vc.isMixed()) + private Map annotatePileup(final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { + + if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here return null; final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); @@ -85,14 +102,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); if (thisContext != null) { final ReadBackedPileup thisPileup = thisContext.getBasePileup(); - if (vc.isSNP()) - scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple 
average of all sample's score since the score can be negative and the RMS doesn't make sense - else if (vc.isIndel() || vc.isMixed()) { - Double d = scoreIndelsAgainstHaplotypes(thisPileup); - if (d == null) - return null; - scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense - } + scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense } } } @@ -103,7 +113,32 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot return map; } - private static class HaplotypeComparator implements Comparator { + private Map annotateWithLikelihoods(final Map stratifiedPerReadAlleleLikelihoodMap, + final VariantContext vc) { + + final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); + for (final Genotype genotype : vc.getGenotypes()) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoodMap == null) + continue; + + Double d = scoreIndelsAgainstHaplotypes(perReadAlleleLikelihoodMap); + if (d == null) + continue; + scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + } + + // if (scoreRA.observationCount() == 0) + // return null; + + // annotate the score in the info field + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean())); + return map; + + } + + private static class HaplotypeComparator implements Comparator, Serializable { public int compare(Haplotype a, Haplotype b) { if (a.getQualitySum() < b.getQualitySum()) @@ -177,7 +212,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private Haplotype getHaplotypeFromRead(final PileupElement p, final int 
contextSize, final int locus) { final GATKSAMRecord read = p.getRead(); - int readOffsetFromPileup = p.getOffset(); final byte[] haplotypeBases = new byte[contextSize]; Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); @@ -189,7 +223,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot byte[] readQuals = read.getBaseQualities(); readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); + final int readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; for (int i = 0; i < contextSize; i++) { @@ -346,31 +380,26 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { + private Double scoreIndelsAgainstHaplotypes(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { final ArrayList haplotypeScores = new ArrayList(); - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - - if (indelLikelihoodMap == null) + if (perReadAlleleLikelihoodMap.isEmpty()) return null; - for (final PileupElement p : pileup) { - if (indelLikelihoodMap.containsKey(p)) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); + for (Map el : perReadAlleleLikelihoodMap.getLikelihoodMapValues()) { - // Score all the reads in the pileup, even the filtered ones - final double[] scores = new double[el.size()]; - int i = 0; - for (Map.Entry a : el.entrySet()) { - scores[i++] = -a.getValue(); - if (DEBUG) { - System.out.printf(" vs. 
haplotype %d = %f%n", i - 1, scores[i - 1]); - } + // retrieve likelihood information corresponding to this read + // Score all the reads in the pileup, even the filtered ones + final double[] scores = new double[el.size()]; + int i = 0; + for (Map.Entry a : el.entrySet()) { + scores[i++] = -a.getValue(); + if (DEBUG) { + System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]); } - - haplotypeScores.add(scores); } + + haplotypeScores.add(scores); } // indel likelihoods are strict log-probs, not phred scored diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index 6ba85de07..06fa04526 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -29,7 +30,12 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress private static final int MIN_GENOTYPE_QUALITY = 10; private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + 
final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index 9f20bf375..5891cbc69 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -22,7 +23,12 @@ public class HomopolymerRun extends InfoFieldAnnotation { private boolean ANNOTATE_INDELS = true; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isBiallelic() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 715895526..64be64afa 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -33,17 +34,18 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private static final int MIN_SAMPLES = 10; private Set founderIds; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { //If available, get the founder IDs and cache them. the IC will only be computed on founders then. - if(founderIds == null) + if(founderIds == null && walker != null) founderIds = ((Walker)walker).getSampleDB().getFounderIds(); return calculateIC(vc); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - return calculateIC(vc); - } - private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? 
vc.getGenotypes() : vc.getGenotypes(founderIds); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java index babaf7ee6..5f405cb46 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.IndelUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -18,9 +19,14 @@ import java.util.*; */ public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { - int run; + int run; if (vc.isMixed()) { Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%s", "MIXED")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java index 7f5033adf..4be601bc8 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -21,7 +22,12 @@ import java.util.Map; */ public class LowMQ extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b6f24433e..8aa961c75 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -28,27 +30,45 @@ import java.util.*; public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; - private String motherId; - private String fatherId; - private String childId; + private Set trios; + private class Trio { + String motherId; + String fatherId; + String childId; + } - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( mendelianViolation == null ) { if (checkAndSetSamples(((Walker) walker).getSampleDB())) { mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line."); } } Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && - vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && - vc.hasGenotype(childId) && 
vc.getGenotype(childId).hasLikelihoods(); - if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); + //double pNoMV = 1.0; + double maxMVLR = Double.MIN_VALUE; + for ( Trio trio : trios ) { + boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() && + vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() && + vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods(); + if ( hasAppropriateGenotypes ) { + Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId); + maxMVLR = likR > maxMVLR ? likR : maxMVLR; + //pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR))); + } + } + //double pSomeMV = 1.0-pNoMV; + //toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV)); + toRet.put("MVLR",maxMVLR); return toRet; } @@ -58,25 +78,24 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } private boolean checkAndSetSamples(SampleDB db){ + trios = new HashSet(); Set families = db.getFamilyIDs(); - if(families.size() != 1) - return false; - - Set family = db.getFamily(families.iterator().next()); - if(family.size() != 3) - return false; - - Iterator sampleIter = family.iterator(); - Sample sample; - for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){ - if(sample.getParents().size()==2){ - motherId = sample.getMaternalID(); - fatherId = sample.getPaternalID(); - childId = sample.getID(); - return true; + for ( String familyString : families ) { + Set family = db.getFamily(familyString); + Iterator sampleIterator = family.iterator(); + Sample sample; + for ( sample = sampleIterator.next(); sampleIterator.hasNext(); 
sample=sampleIterator.next()) { + if ( sample.getParents().size() == 2 ) { + Trio trio = new Trio(); + trio.childId = sample.getID(); + trio.fatherId = sample.getFather().getID(); + trio.motherId = sample.getMother().getID(); + trios.add(trio); + } } } - return false; + + return trios.size() > 0; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 31067e386..6557f3e47 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -2,11 +2,13 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -23,60 +25,36 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. 
Ref read mapping qualities")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { - if ( p.getBase() == ref ) { - refQuals.add((double)p.getMappingQual()); - } else if ( alts.contains(p.getBase()) ) { - altQuals.add((double)p.getMappingQual()); - } - } - } - } + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap likelihoodMap, + final List refQuals, final List altQuals) { - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - if ( matchesRef ) - refQuals.add((double)read.getMappingQuality()); - else - altQuals.add((double)read.getMappingQuality()); - } - } - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
- HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry a : el.entrySet()) { - - if (a.getKey().isReference()) - refLikelihood = a.getValue(); - else { - double like = a.getValue(); - if (like >= altLikelihood) - altLikelihood = like; + if (pileup != null && likelihoodMap == null) { + // no per-read likelihoods available: + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { + refQuals.add((double)p.getMappingQual()); + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { + altQuals.add((double)p.getMappingQual()); } } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add((double)p.getMappingQual()); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add((double)p.getMappingQual()); } + return; + } + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add((double)el.getKey().getMappingQuality()); + else if (allAlleles.contains(a)) + altQuals.add((double)el.getKey().getMappingQuality()); + + } } - -} \ No newline at end of file + + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index 
372d5bc9e..74f9c0d0c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -3,14 +3,18 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -22,9 +26,25 @@ import java.util.Map; /** * Total count across all samples of mapping quality zero reads */ -public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation { +public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final 
AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + if ((vc.isSNP() || !vc.isVariant()) && stratifiedContexts != null) + return annotatePileup(ref, stratifiedContexts, vc); + else if (stratifiedPerReadAlleleLikelihoodMap != null && vc.isVariant()) + return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); + else + return null; + } + + private Map annotatePileup(final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; @@ -42,6 +62,25 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA return map; } + private Map annotateWithLikelihoods(final Map stratifiedPerReadAlleleLikelihoodMap, + final VariantContext vc) { + if (stratifiedPerReadAlleleLikelihoodMap == null) + return null; + + int mq0 = 0; + for ( PerReadAlleleLikelihoodMap likelihoodMap : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (GATKSAMRecord read : likelihoodMap.getLikelihoodReadMap().keySet()) { + + if (read.getMappingQuality() == 0 ) + mq0++; + } + } + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", mq0)); + return map; + } + + public List getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); } public List getDescriptions() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index b5252f15b..354b798bb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -46,14 +47,19 @@ import java.util.List; * Count for each sample of mapping quality zero reads */ public class MappingQualityZeroBySample extends GenotypeAnnotation { - public void annotate(RefMetaDataTracker tracker, - AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext context, - VariantContext vc, Genotype g, GenotypeBuilder gb) { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ if ( g == null || !g.isCalled() ) return; int mq0 = 0; - final ReadBackedPileup pileup = context.getBasePileup(); + final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 ) mq0++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java index 9f542e3bd..21ee66ea2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -22,7 +23,12 @@ import java.util.Map; */ public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java index ba4303b4a..8e4edaf0e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -20,7 +21,12 @@ import java.util.Map; * The number of N bases, counting only SOLiD data */ public class NBaseCount extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index b62cd374b..a48d4a678 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -28,8 +29,13 @@ import java.util.Map; */ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map 
stratifiedContexts, VariantContext vc) { - if ( !vc.hasLog10PError() || stratifiedContexts.size() == 0 ) + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + if ( !vc.hasLog10PError() ) return null; final GenotypesContext genotypes = vc.getGenotypes(); @@ -44,11 +50,20 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( !genotype.isHet() && !genotype.isHomVar() ) continue; - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; + if (stratifiedContexts!= null) { + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + depth += context.getBasePileup().depthOfCoverage(); - depth += context.getBasePileup().depthOfCoverage(); + } + else if (perReadAlleleLikelihoodMap != null) { + PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) + continue; + + depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); + } } if ( depth == 0 ) @@ -67,39 +82,5 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - int depth = 0; - - for ( final Genotype genotype : genotypes ) { - - // we care only about variant calls with likelihoods - if ( !genotype.isHet() && !genotype.isHomVar() ) - continue; - - final Map> 
alleleBins = stratifiedContexts.get(genotype.getSampleName()); - if ( alleleBins == null ) - continue; - - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { - depth += alleleBin.getValue().size(); - } - } - - if ( depth == 0 ) - return null; - - double QD = -10.0 * vc.getLog10PError() / (double)depth; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", QD)); - return map; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 842fde8ad..680478da0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -18,10 +19,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -29,25 +27,48 @@ import java.util.Map; */ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, 
AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + int totalSize = 0, index = 0; + int qualities[]; + if (stratifiedContexts != null) { + if ( stratifiedContexts.size() == 0 ) + return null; - int totalSize = 0; - for ( AlignmentContext context : stratifiedContexts.values() ) - totalSize += context.size(); + for ( AlignmentContext context : stratifiedContexts.values() ) + totalSize += context.size(); - final int[] qualities = new int[totalSize]; - int index = 0; + qualities = new int[totalSize]; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = context.getBasePileup(); - for (PileupElement p : pileup ) { - if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) - qualities[index++] = p.getMappingQual(); + for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + AlignmentContext context = sample.getValue(); + for (PileupElement p : context.getBasePileup() ) + index = fillMappingQualitiesFromPileupAndUpdateIndex(p.getRead(), index, qualities); } } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; + + for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) + totalSize += perReadLikelihoods.size(); + + qualities = new int[totalSize]; + for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) { + for (GATKSAMRecord read : perReadLikelihoods.getStoredElements()) + index = fillMappingQualitiesFromPileupAndUpdateIndex(read, index, qualities); + + + } + } + else + return null; + + double rms = 
MathUtils.rms(qualities); Map map = new HashMap(); @@ -55,32 +76,12 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + private static int fillMappingQualitiesFromPileupAndUpdateIndex(final GATKSAMRecord read, final int inputIdx, final int[] qualities) { + int outputIdx = inputIdx; + if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[outputIdx++] = read.getMappingQuality(); - int depth = 0; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { - depth += alleleBin.getValue().size(); - } - } - - final int[] qualities = new int[depth]; - int index = 0; - - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final List reads : alleleBins.values() ) { - for ( final GATKSAMRecord read : reads ) { - if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) - qualities[index++] = read.getMappingQuality(); - } - } - } - - final Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", MathUtils.rms(qualities))); - return map; + return outputIdx; } public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index bf6adcfac..ec873c5dd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.Pair; @@ -28,12 +29,15 @@ import java.util.Map; * Abstract root for all RankSum based annotations */ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { - static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null final GenotypesContext genotypes = vc.getGenotypes(); if (genotypes == null || genotypes.size() == 0) @@ -42,37 +46,28 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); - if ( vc.isSNP() ) { - final List altAlleles = new ArrayList(); - for ( final Allele a : vc.getAlternateAlleles() ) - altAlleles.add(a.getBases()[0]); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + PerReadAlleleLikelihoodMap indelLikelihoodMap = null; + ReadBackedPileup pileup = null; - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + + if (stratifiedContexts != null) { final AlignmentContext context = 
stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - - fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals); + if ( context != null ) + pileup = context.getBasePileup(); } - } else if ( vc.isIndel() || vc.isMixed() ) { + if (stratifiedPerReadAlleleLikelihoodMap != null ) + indelLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - for (final Genotype genotype : genotypes.iterateInSampleNameOrder()) { - final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if (context == null) { - continue; - } + if (indelLikelihoodMap != null && indelLikelihoodMap.isEmpty()) + indelLikelihoodMap = null; + // treat an empty likelihood map as a null reference - will simplify contract with fillQualsFromPileup + if (indelLikelihoodMap == null && pileup == null) + continue; - final ReadBackedPileup pileup = context.getBasePileup(); - if (pileup == null) - continue; - - if (IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap() == null || - IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().size() == 0) - return null; - - fillIndelQualsFromPileup(pileup, refQuals, altQuals); - } - } else + fillQualsFromPileup(vc.getAlleles(), vc.getStart(), pileup, indelLikelihoodMap, refQuals, altQuals ); + } + if (refQuals.isEmpty() && altQuals.isEmpty()) return null; final MannWhitneyU mannWhitneyU = new MannWhitneyU(); @@ -103,50 +98,12 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if (genotypes == null || genotypes.size() == 0) - return null; - - final ArrayList refQuals = new ArrayList(); - final ArrayList altQuals = new ArrayList(); - - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { 
- final Map> context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - - fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), vc.getStart(), context, refQuals, altQuals); - } - - if ( refQuals.size() == 0 || altQuals.size() == 0 ) - return null; - - final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for (final Double qual : altQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); - } - for (final Double qual : refQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); - } - - // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) - final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); - - final Map map = new HashMap(); - if (!Double.isNaN(testResults.first)) - map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); - return map; - } - - protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, List altQuals); - - protected abstract void fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); - - protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List refQuals, final List altQuals); + protected abstract void fillQualsFromPileup(final List alleles, + final int refLoc, + final ReadBackedPileup readBackedPileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, + final List altQuals); /** * Can the base in this pileup element be used in comparative tests between ref / alt bases? 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 3456041c7..1ac8ee113 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -5,7 +5,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -32,98 +32,64 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. 
Ref read position bias")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for (final PileupElement p : pileup) { - if (isUsableBase(p)) { - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); - final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); - if (readPos > numAlignedBases / 2) - readPos = numAlignedBases - (readPos + 1); + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, final List altQuals) { + if (alleleLikelihoodMap == null) { + // use fast SNP-based version if we don't have per-read allele likelihoods + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); - if ( p.getBase() == ref ) - refQuals.add((double) readPos); - else if ( alts.contains(p.getBase()) ) - altQuals.add((double) readPos); - } - } - } + readPos = getFinalReadPosition(p.getRead(),readPos); - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); - if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) - continue; - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); - - final int numAlignedBases = 
AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); - if (readPos > numAlignedBases / 2) - readPos = numAlignedBases - (readPos + 1); - - if ( matchesRef ) - refQuals.add((double) readPos); - else - altQuals.add((double) readPos); - } - } - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele - // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. - // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). - // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. - // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pielup element is "ref" - // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p : pileup) { - if (indelLikelihoodMap.containsKey(p)) { - LinkedHashMap el = indelLikelihoodMap.get(p); // retrieve likelihood information corresponding to this read - double refLikelihood = 0.0, altLikelihood = Double.NEGATIVE_INFINITY; // by design, first element in LinkedHashMap was ref allele - - for (Map.Entry a : el.entrySet()) { - if (a.getKey().isReference()) - refLikelihood = a.getValue(); - else { - double like = a.getValue(); - if (like >= altLikelihood) - altLikelihood = like; + if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { + refQuals.add((double)readPos); + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { + altQuals.add((double)readPos); } } - - int readPos = getOffsetFromClippedReadStart(p.getRead(), p.getOffset()); - final int numAlignedBases = 
getNumAlignedBases(p.getRead()); - - if (readPos > numAlignedBases / 2) { - readPos = numAlignedBases - (readPos + 1); - } - //if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); - - - // if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads - // where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping. - // if (readPos < -1) - // return; - if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { - refQuals.add((double) readPos); - //if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos); - } else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { - altQuals.add((double) readPos); - //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); - - } - - } + return; + } + + for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); + final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); + if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) + continue; + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); + final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); + if (readPos > numAlignedBases / 2) + readPos = numAlignedBases - (readPos + 1); + +// int readPos = getOffsetFromClippedReadStart(el.getKey(), el.getKey().getOffset()); + // readPos = getFinalReadPosition(el.getKey().getRead(),readPos); + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + 
refQuals.add((double)readPos); + else if (allAlleles.contains(a)) + altQuals.add((double)readPos); + } } + int getFinalReadPosition(GATKSAMRecord read, int initialReadPosition) { + final int numAlignedBases = getNumAlignedBases(read); + + int readPos = initialReadPosition; + if (initialReadPosition > numAlignedBases / 2) { + readPos = numAlignedBases - (initialReadPosition + 1); + } + return readPos; + + } int getNumClippedBasesAtStart(SAMRecord read) { // compute total number of clipped bases (soft or hard clipped) // check for hard clips (never consider these bases): diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index 7e4d44cf2..abe55de5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -46,7 +47,12 @@ import java.util.Map; */ public class SampleList extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final 
Map stratifiedPerReadAlleleLikelihoodMap) { if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 4d990e738..f0bd7ecd9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -225,7 +226,12 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue())); } - public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); // Get only SnpEff records that start at this locus, not merely span it: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index 
af2df8e6a..f6bb4e747 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -22,7 +23,12 @@ import java.util.Map; */ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index f220ecbd2..3d62f2d96 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; 
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -47,7 +48,12 @@ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements Standa private static final String STR_PRESENT = "STR"; private static final String REPEAT_UNIT_KEY = "RU"; private static final String REPEATS_PER_ALLELE_KEY = "RPA"; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isIndel()) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java index 63694d809..43ef188a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -28,7 +29,12 @@ public class TechnologyComposition extends InfoFieldAnnotation implements Experi private String n454 ="Num454"; private String nSolid = "NumSOLiD"; private String nOther = "NumOther"; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index 2e3578dcb..c3e98c20f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -28,7 +29,12 @@ public class TransmissionDisequilibriumTest extends 
InfoFieldAnnotation implemen private Set trios = null; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( trios == null ) { if ( walker instanceof VariantAnnotator ) { trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 073faf54e..a1bd8dcbd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -178,7 +179,18 @@ public class VariantAnnotatorEngine { this.requireStrictAlleleMatch = requireStrictAlleleMatch; } - public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, VariantContext vc) { + public 
VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + VariantContext vc) { + return annotateContext(tracker, ref, stratifiedContexts, vc, null); + } + + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + VariantContext vc, + final Map perReadAlleleLikelihoodMap) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences @@ -189,7 +201,7 @@ public class VariantAnnotatorEngine { // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc); + Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); if ( annotationsFromCurrentType != null ) infoAnnotations.putAll(annotationsFromCurrentType); } @@ -198,22 +210,25 @@ public class VariantAnnotatorEngine { VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process - return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); + return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); } - public VariantContext annotateContext(final Map>> stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final Map perReadAlleleLikelihoodMap, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc); + Map annotationsFromCurrentType = 
((ActiveRegionBasedAnnotation)annotationType).annotate(perReadAlleleLikelihoodMap, vc); if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); } } // generate a new annotated VC - return new VariantContextBuilder(vc).attributes(infoAnnotations).make(); + VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); + + // annotate genotypes, creating another new VC in the process + return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make(); } private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { @@ -266,20 +281,30 @@ public class VariantAnnotatorEngine { } } - private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + + private GenotypesContext annotateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext ref, final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( requestedGenotypeAnnotations.isEmpty() ) return vc.getGenotypes(); final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + AlignmentContext context = null; + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = null; + if (stratifiedContexts != null) + context = stratifiedContexts.get(genotype.getSampleName()); + if (stratifiedPerReadAlleleLikelihoodMap != null) + perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( context == null ) { + if ( context == null && perReadAlleleLikelihoodMap == null) { + // no likelihoods nor pileup available: just move on to next sample genotypes.add(genotype); } else { final GenotypeBuilder gb = new GenotypeBuilder(genotype); for ( 
final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.annotate(tracker, walker, ref, context, vc, genotype, gb); + annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); } genotypes.add(gb.make()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java index de61c7741..9c5710872 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -10,8 +11,8 @@ import java.util.Map; // TODO -- make this an abstract class when we move away from InfoFieldAnnotation public interface ActiveRegionBasedAnnotation extends AnnotationType { - // return annotations for the given contexts split by sample and then allele - public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc); + // return annotations for the given contexts split by sample and then read likelihood + public abstract Map annotate(final Map stratifiedContexts, final VariantContext vc); // return the descriptions used for the VCF INFO meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java index bc20f6c97..6c55c1c00 
100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -13,9 +14,14 @@ import java.util.List; public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts/genotype split by sample - public abstract void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, - ReferenceContext ref, AlignmentContext stratifiedContext, - VariantContext vc, Genotype g, GenotypeBuilder gb ); + public abstract void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap); // return the descriptions used for the VCF FORMAT meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java index 1569a605f..738be9883 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -11,8 +12,25 @@ import java.util.Map; public abstract class InfoFieldAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts split by sample - public abstract Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, - ReferenceContext ref, Map stratifiedContexts, VariantContext vc); + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { + return annotate(tracker, walker, ref, stratifiedContexts, vc, null); + } + + public Map annotate(Map perReadAlleleLikelihoodMap, VariantContext vc) { + return annotate(null, null, null, null, vc, perReadAlleleLikelihoodMap); + } + + + public abstract Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap); // return the descriptions used for the VCF INFO meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 08c7da754..aec1bf7a8 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -42,7 +42,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP protected RecalibrationTables recalibrationTables; public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { - this.covariates = covariates; + this.covariates = covariates.clone(); this.recalibrationTables = recalibrationTables; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 6fdc926d5..77da35c41 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -103,7 +103,8 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser); + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap); protected int getFilteredDepth(ReadBackedPileup pileup) { @@ -115,4 +116,5 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { return count; } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index bedffa690..ebfbc49fe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -48,24 +48,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; - private static ThreadLocal>> indelLikelihoodMap = - new ThreadLocal>>() { - protected synchronized HashMap> initialValue() { - return new HashMap>(); - } - }; private LinkedHashMap haplotypeMap; - // gdebug removeme - // todo -cleanup - private GenomeLoc lastSiteVisited; private List alleleList = new ArrayList(); - static { - indelLikelihoodMap.set(new HashMap>()); - } - protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); @@ -93,16 +80,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { GenomeLoc loc = ref.getLocus(); // if (!ref.getLocus().equals(lastSiteVisited)) { if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { // starting a new site: clear allele list - lastSiteVisited = ref.getLocus(); - indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); - + perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); if (alleleList.isEmpty()) return null; @@ -130,10 +116,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for (Map.Entry sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + if 
(!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ + // no likelihoods have been computed for this sample at this site + perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap()); + } final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey())); b.PL(genotypeLikelihoods); b.DP(getFilteredDepth(pileup)); genotypes.add(b.make()); @@ -150,10 +140,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return builder.genotypes(genotypes).make(); } - public static HashMap> getIndelLikelihoodMap() { - return indelLikelihoodMap.get(); - } - public static void getHaplotypeMapFromAlleles(final List alleleList, final ReferenceContext ref, final GenomeLoc loc, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java new file mode 100644 index 000000000..9c0062876 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, 
subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.genotyper; + + +//import org.broadinstitute.sting.gatk.walkers.Requires; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.*; + +public class PerReadAlleleLikelihoodMap { + public static final double INDEL_LIKELIHOOD_THRESH = 0.1; + + private List alleles; + private Map> likelihoodReadMap; + public PerReadAlleleLikelihoodMap() { + likelihoodReadMap = new LinkedHashMap>(); + alleles = new ArrayList(); + } + + public void add(GATKSAMRecord read, Allele a, Double likelihood) { + Map likelihoodMap; + if (likelihoodReadMap.containsKey(read)){ + // seen pileup element before + likelihoodMap = likelihoodReadMap.get(read); + } + else { + likelihoodMap = new HashMap(); + likelihoodReadMap.put(read,likelihoodMap); + } + likelihoodMap.put(a,likelihood); + + if (!alleles.contains(a)) + alleles.add(a); + + } + + public int size() { + return likelihoodReadMap.size(); + } + + public void add(PileupElement p, Allele a, Double likelihood) { + add(p.getRead(),a,likelihood); + } + + public boolean containsPileupElement(PileupElement p) { + return 
likelihoodReadMap.containsKey(p.getRead()); + } + + public boolean isEmpty() { + return likelihoodReadMap.isEmpty(); + } + + public Map> getLikelihoodReadMap() { + return likelihoodReadMap; + } + public void clear() { + alleles.clear(); + likelihoodReadMap.clear(); + } + + public Set getStoredElements() { + return likelihoodReadMap.keySet(); + } + + public Collection> getLikelihoodMapValues() { + return likelihoodReadMap.values(); + } + + public int getNumberOfStoredElements() { + return likelihoodReadMap.size(); + } + /** + * Returns list of reads greedily associated with a particular allele. + * Needs to loop for each read, and assign to each allele + * @param a Desired allele + * @return + */ + @Requires("a!=null") + public List getReadsAssociatedWithAllele(Allele a) { + return null; + } + + public Map getLikelihoodsAssociatedWithPileupElement(PileupElement p) { + if (!likelihoodReadMap.containsKey(p.getRead())) + return null; + + return likelihoodReadMap.get(p.getRead()); + } + + public static Allele getMostLikelyAllele(Map alleleMap) { + double minLike = Double.POSITIVE_INFINITY, maxLike = Double.NEGATIVE_INFINITY; + Allele mostLikelyAllele = Allele.NO_CALL; + + for (Map.Entry el : alleleMap.entrySet()) { + if (el.getValue() > maxLike) { + maxLike = el.getValue(); + mostLikelyAllele = el.getKey(); + } + + if (el.getValue() < minLike) + minLike = el.getValue(); + + } + if (maxLike-minLike > INDEL_LIKELIHOOD_THRESH) + return mostLikelyAllele; + else + return Allele.NO_CALL; + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 07d5d2f2d..76ba72017 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { + + perReadAlleleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data final byte refBase = ref.getBase(); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index e755a1e36..30c0f3e18 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -26,11 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -public class UnifiedArgumentCollection { +public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; @@ -42,13 +43,6 @@ public class UnifiedArgumentCollection 
{ @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; - /** - * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are: - * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 - */ - @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) - public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; - /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it @@ -57,26 +51,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) - public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) - public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - - /** - * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with - * confidence >= this threshold are emitted as called sites. 
A reasonable threshold is 30 for high-pass calling (this - * is the default). - */ - @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; - - /** - * This argument allows you to emit low quality calls as filtered records. - */ - @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - /** * Note that calculating the SLOD increases the runtime by an appreciable amount. */ @@ -90,12 +64,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; - /** - * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding - */ - @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) - public RodBinding alleles; - /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. 
@@ -107,16 +75,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - /** - * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it - * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend - * that you not play around with this parameter. - */ - @Advanced - @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 3; - @Hidden @Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false) public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false; @@ -139,7 +97,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; - /** * This argument informs the prior probability of having an indel at a site. 
*/ @@ -181,7 +138,6 @@ public class UnifiedArgumentCollection { Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only sample-level error model */ - @Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false) public boolean IGNORE_LANE_INFO = false; @@ -275,5 +231,16 @@ public class UnifiedArgumentCollection { return uac; } + public UnifiedArgumentCollection() { } + public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) { + super(); + this.alleles = SCAC.alleles; + this.GenotypingMode = SCAC.GenotypingMode; + this.heterozygosity = SCAC.heterozygosity; + this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.OutputMode = SCAC.OutputMode; + this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; + this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 3d9724ffb..a57c877e0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -177,19 +177,23 @@ public class UnifiedGenotyperEngine { final List results = new ArrayList(2); final List models = getGLModelsToUse(tracker, refContext, rawContext); + + final Map perReadAlleleLikelihoodMap = new HashMap(); + if ( models.isEmpty() ) { results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, null, rawContext) : null); } else { for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + perReadAlleleLikelihoodMap.clear(); final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); if ( stratifiedContexts == null ) { results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); } else { - final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) - results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true)); + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); } } } @@ -219,9 +223,13 @@ public class UnifiedGenotyperEngine { * @param tracker the meta data tracker * @param refContext the reference base * @param rawContext contextual information around the locus + * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) * @return the VariantContext object */ - public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map perReadAlleleLikelihoodMap) { final List models = getGLModelsToUse(tracker, refContext, rawContext); if ( models.isEmpty() ) { return null; @@ -231,7 +239,7 @@ public class 
UnifiedGenotyperEngine { final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); // return the first valid one we encounter if ( stratifiedContexts != null ) - return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); } @@ -247,7 +255,10 @@ public class UnifiedGenotyperEngine { * @param vc the GL-annotated variant context * @return the VariantCallContext object */ - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final VariantContext vc) { final List models = getGLModelsToUse(tracker, refContext, rawContext); if ( models.isEmpty() ) { return null; @@ -256,7 +267,7 @@ public class UnifiedGenotyperEngine { // return the first one final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); } /** @@ -266,7 +277,7 @@ public class UnifiedGenotyperEngine { * @return the VariantCallContext object */ public VariantCallContext calculateGenotypes(VariantContext vc) { - return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), false); + return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null); } @@ -277,14 +288,21 @@ public 
class UnifiedGenotyperEngine { // --------------------------------------------------------------------------------------------------------- // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine - private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, AlignmentContextUtils.ReadOrientation type, List alternateAllelesToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) { + private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final Map stratifiedContexts, + final AlignmentContextUtils.ReadOrientation type, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { // initialize the data for this thread if that hasn't been done yet if ( glcm.get() == null ) { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); + return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -315,12 +333,22 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, false); } - public VariantCallContext calculateGenotypes(VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(null, null, null, null, vc, model); + public VariantCallContext calculateGenotypes(final VariantContext vc, final 
GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(null, null, null, null, vc, model, perReadAlleleLikelihoodMap); } - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false); + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(null, null, null, null, vc, model, null); + } + + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map stratifiedContexts, + final VariantContext vc, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false,perReadAlleleLikelihoodMap); } /** @@ -334,8 +362,11 @@ public class UnifiedGenotyperEngine { * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc * @return VC with assigned genotypes */ - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final boolean inheritAttributesFromInputVC) { + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext, Map stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { boolean 
limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; @@ -462,7 +493,7 @@ public class UnifiedGenotyperEngine { List allAllelesToUse = builder.make().getAlleles(); // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model); + VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); @@ -471,7 +502,7 @@ public class UnifiedGenotyperEngine { //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model); + VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); @@ -507,7 +538,7 @@ public class UnifiedGenotyperEngine { final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, 
perReadAlleleLikelihoodMap); } return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 65c5a2fbc..9234a9fe8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.PairHMM; @@ -40,6 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; public class PairHMMIndelErrorModel { @@ -167,11 +169,15 @@ public class PairHMMIndelErrorModel { } - public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ final int numHaplotypes = haplotypeMap.size(); final int readCounts[] = new int[pileup.getNumberOfElements()]; - final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts); + final double[][] readLikelihoods = 
computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); } @@ -181,7 +187,7 @@ public class PairHMMIndelErrorModel { final LinkedHashMap haplotypeMap, final ReferenceContext ref, final int eventLength, - final HashMap> indelLikelihoodMap, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; final PairHMM pairHMM = new PairHMM(bandedLikelihoods); @@ -192,8 +198,8 @@ public class PairHMMIndelErrorModel { readCounts[readIdx] = p.getRepresentativeCount(); // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) - if (indelLikelihoodMap.containsKey(p)) { - HashMap el = indelLikelihoodMap.get(p); + if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { + Map el = perReadAlleleLikelihoodMap.getLikelihoodsAssociatedWithPileupElement(p); int j=0; for (Allele a: haplotypeMap.keySet()) { readLikelihoods[readIdx][j++] = el.get(a); @@ -201,7 +207,7 @@ public class PairHMMIndelErrorModel { } else { final int refWindowStart = ref.getWindow().getStart(); - final int refWindowStop = ref.getWindow().getStop(); + final int refWindowStop = ref.getWindow().getStop(); if (DEBUG) { System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); @@ -280,7 +286,7 @@ public class PairHMMIndelErrorModel { System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); - 
LinkedHashMap readEl = new LinkedHashMap(); + // LinkedHashMap readEl = new LinkedHashMap(); /** * Check if we'll end up with an empty read once all clipping is done @@ -288,7 +294,7 @@ public class PairHMMIndelErrorModel { if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) { int j=0; for (Allele a: haplotypeMap.keySet()) { - readEl.put(a,0.0); + perReadAlleleLikelihoodMap.add(p,a,0.0); readLikelihoods[readIdx][j++] = 0.0; } } @@ -329,45 +335,45 @@ public class PairHMMIndelErrorModel { - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), - (int)indStart, (int)indStop); + final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), + (int)indStart, (int)indStop); - final int X_METRIC_LENGTH = readBases.length+2; - final int Y_METRIC_LENGTH = haplotypeBases.length+2; + final int X_METRIC_LENGTH = readBases.length+2; + final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { - //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + if (matchMetricArray == null) { + //no need to reallocate arrays for each new haplotype, as length won't change + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - } + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + } - int startIndexInHaplotype = 0; - if (previousHaplotypeSeen != null) - startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - previousHaplotypeSeen = haplotypeBases.clone(); + int startIndexInHaplotype = 0; + 
if (previousHaplotypeSeen != null) + startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + previousHaplotypeSeen = haplotypeBases.clone(); - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, - (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), - (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), + (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), + contextLogGapContinuationProbabilities, + startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - if (DEBUG) { - System.out.println("H:"+new String(haplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIndexInHaplotype); - } - readEl.put(a,readLikelihood); + if (DEBUG) { + System.out.println("H:"+new String(haplotypeBases)); + System.out.println("R:"+new String(readBases)); + System.out.format("L:%4.2f\n",readLikelihood); + System.out.format("StPos:%d\n", startIndexInHaplotype); + } + + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } } - indelLikelihoodMap.put(p,readEl); } readIdx++; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java index 9228dc375..042d4741d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -29,10 +29,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -41,7 +38,7 @@ import java.util.*; * Date: Mar 10, 2011 */ -public class Tranche implements Comparable { +public class Tranche { private static final int CURRENT_VERSION = 5; public double ts, minVQSLod, knownTiTv, novelTiTv; @@ -83,10 +80,14 @@ public class Tranche implements Comparable { return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; } - public int compareTo(Tranche other) { - return Double.compare(this.ts, other.ts); + public static class TrancheTruthSensitivityComparator implements Comparator, Serializable { + @Override + public int compare(final Tranche tranche1, final Tranche tranche2) { + return Double.compare(tranche1.ts, tranche2.ts); + } } + @Override public String toString() { return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.4f) novel=(%d @ %.4f) truthSites(%d accessible, %d called), name=%s]", ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); @@ -102,7 +103,7 @@ public class Tranche implements Comparable { final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); final PrintStream stream = new PrintStream(bytes); - Collections.sort(tranches); + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); stream.println("# Variant quality score tranches file"); stream.println("# Version number " + CURRENT_VERSION); @@ -183,7 +184,7 @@ public class Tranche implements Comparable { } } - Collections.sort(tranches); + Collections.sort( tranches, new 
TrancheTruthSensitivityComparator() ); return tranches; } catch( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(f, e); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java index af0778399..58b4e4fc7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java @@ -146,7 +146,7 @@ public class TrancheManager { public static List findTranches( final ArrayList data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) { logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size())); - Collections.sort(data); + Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); metric.calculateRunningMetric(data); if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e88505f99..33a543e39 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -158,7 +158,7 @@ public class VariantDataManager { logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." 
); // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants - Collections.sort( data ); + Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) ); if( numToAdd > data.size() ) { throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --minNumBadVariants arugment but this is unsafe." ); @@ -286,6 +286,7 @@ public class VariantDataManager { case INDEL: case MIXED: case SYMBOLIC: + case STRUCTURAL_INDEL: return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); default: return false; @@ -297,7 +298,7 @@ public class VariantDataManager { case SNP: return evalVC.isSNP() || evalVC.isMNP(); case INDEL: - return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); + return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); case BOTH: return true; default: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index a85129d78..7b3b0d17d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -27,13 +27,16 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.utils.GenomeLoc; +import java.io.Serializable; +import java.util.Comparator; + /** * Created by IntelliJ IDEA. 
* User: rpoplin * Date: Mar 4, 2011 */ -public class VariantDatum implements Comparable { +public class VariantDatum { public double[] annotations; public boolean[] isNull; @@ -52,8 +55,10 @@ public class VariantDatum implements Comparable { public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation - @Override - public int compareTo( final VariantDatum other ) { - return Double.compare(this.lod, other.lod); + public static class VariantDatumLODComparator implements Comparator, Serializable { + @Override + public int compare(final VariantDatum datum1, final VariantDatum datum2) { + return Double.compare(datum1.lod, datum2.lod); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bfd9aa52f..fc29a7f02 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -265,7 +265,7 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. 
Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false) private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL; - @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values in the INFO field after selecting (using keys AC_Orig, AF_Orig, and AN_Orig)", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; /** @@ -322,6 +322,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false) private boolean justRead = false; + @Argument(doc="Maximum indel size to include; sites whose reference or any alternate allele is longer than this are excluded",required=false,fullName="maxIndelSize") + private int maxIndelSize = Integer.MAX_VALUE; + /* Private class used to store the intermediate variants in the integer random selection process */ private static class RandomVariantStructure { @@ -541,6 +544,9 @@ if (!selectedTypes.contains(vc.getType())) continue; + if ( badIndelSize(vc) ) + continue; + VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { @@ -572,6 +578,20 @@ return 1; } + private boolean badIndelSize(final VariantContext vc) { + if ( vc.getReference().length() > maxIndelSize ) { + return true; + } + + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.length() > maxIndelSize ) { + return true; + } + } + + return false; + } + private boolean hasPLs(final VariantContext vc) { for ( Genotype g : vc.getGenotypes() ) { if ( g.hasLikelihoods() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 3fba8fa77..7111bac46 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -76,47 +76,11 @@ public class VariantsToBinaryPed extends RodWalker { private List famOrder = new ArrayList(); public void initialize() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; + initializeValidator(); + writeBedHeader(); + Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers - // write magic bits into the ped file - try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); - // ultimately, the bed will be in individual-major mode - } catch (IOException e) { - throw new ReviewedStingException("error writing to output file."); - } - // write to the fam file, the first six columns of the standard ped file - // first, load data from the input meta data file - Map> metaValues = new HashMap>(); - logger.debug("Reading in metadata..."); - try { - if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { - for ( String line : new XReadLines(metaDataFile) ) { - String[] famSplit = line.split("\\t"); - String sid = famSplit[1]; - outFam.printf("%s%n",line); - } - } else { - for ( String line : new XReadLines(metaDataFile) ) { - logger.debug(line); - String[] split = line.split("\\t"); - String sampleID = split[0]; - String keyVals = split[1]; - HashMap values = new HashMap(); - for ( String kvp : keyVals.split(";") ) { - String[] kvp_split = kvp.split("="); - values.put(kvp_split[0],kvp_split[1]); - } - metaValues.put(sampleID,values); - } - } - } catch (FileNotFoundException e) { - throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); - } // family ID, individual ID, 
Paternal ID, Maternal ID, Sex, Phenotype int dummyID = 0; // increments for dummy parental and family IDs used // want to be especially careful to maintain order here @@ -126,21 +90,23 @@ public class VariantsToBinaryPed extends RodWalker { continue; } for ( String sample : header.getValue().getGenotypeSamples() ) { - Map mVals = metaValues.get(sample); - if ( mVals == null ) { - throw new UserException("No metadata provided for sample "+sample); + if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) { + Map mVals = sampleMetaValues.get(sample); + if ( mVals == null ) { + throw new UserException("No metadata provided for sample "+sample); + } + if ( ! mVals.containsKey("phenotype") ) { + throw new UserException("No phenotype data provided for sample "+sample); + } + String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); + String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); + String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); + String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; + String pheno = mVals.get("phenotype"); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); } - if ( ! mVals.containsKey("phenotype") ) { - throw new UserException("No phenotype data provided for sample "+sample); - } - String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); - String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); - String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); - String sex = mVals.containsKey("sex") ? 
mVals.get("sex") : "3"; - String pheno = mVals.get("phenotype"); - outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); try { - File temp = File.createTempFile(sample, ".tmp"); + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); printMap.put(sample,new PrintStream(temp)); tempFiles.put(sample,temp); } catch (IOException e) { @@ -216,6 +182,7 @@ public class VariantsToBinaryPed extends RodWalker { // reset the buffer for this sample genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } + byteCount = 0; } genotypeCount = 0; } @@ -337,4 +304,69 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF."); } } + + private void initializeValidator() { + vv.variantCollection = variantCollection; + vv.dbsnp = dbsnp; + vv.DO_NOT_VALIDATE_FILTERED = true; + vv.type = ValidateVariants.ValidationType.REF; + } + + private void writeBedHeader() { + // write magic bits into the ped file + try { + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + // ultimately, the bed will be in individual-major mode + } catch (IOException e) { + throw new ReviewedStingException("error writing to output file."); + } + } + + private Map> parseMetaData() { + // write to the fam file, the first six columns of the standard ped file + // first, load data from the input meta data file + Map> metaValues = new HashMap>(); + logger.debug("Reading in metadata..."); + try { + if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { + for ( String line : new XReadLines(metaDataFile) ) { + String[] famSplit = line.split("\\s+"); + if ( famSplit.length != 6 ) { + throw new UserException("Line of the fam file is malformatted. Expected 6 entries. 
Line is "+line); + } + String sid = famSplit[1]; + String fid = famSplit[0]; + String dad = famSplit[2]; + String mom = famSplit[3]; + String sex = famSplit[4]; + String pheno = famSplit[5]; + HashMap values = new HashMap(); + values.put("mom",mom); + values.put("dad",dad); + values.put("fid",fid); + values.put("sex",sex); + values.put("phenotype",pheno); + metaValues.put(sid,values); + outFam.printf("%s%n",line); + } + } else { + for ( String line : new XReadLines(metaDataFile) ) { + logger.debug(line); + String[] split = line.split("\\s+"); + String sampleID = split[0]; + String keyVals = split[1]; + HashMap values = new HashMap(); + for ( String kvp : keyVals.split(";") ) { + String[] kvp_split = kvp.split("="); + values.put(kvp_split[0],kvp_split[1]); + } + metaValues.put(sampleID,values); + } + } + } catch (FileNotFoundException e) { + throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); + } + + return metaValues; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 13571df78..8c95091a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils; +import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.util.Arrays; @@ -227,14 +228,21 @@ public class BaseUtils { } @Deprecated - static public boolean isRegularBase(char base) { + static public boolean isRegularBase( final char base ) { return simpleBaseToBaseIndex(base) != -1; } - static public boolean isRegularBase(byte base) { + static public boolean isRegularBase( final byte base ) { return simpleBaseToBaseIndex(base) != -1; } + static public boolean isAllRegularBases( final byte[] bases ) { + for( final byte base : bases) { + if( !isRegularBase(base) ) { return false; } + } + return true; 
+ } + static public boolean isNBase(byte base) { return base == 'N' || base == 'n'; } @@ -437,29 +445,8 @@ public class BaseUtils { * @param bases the bases * @return the upper cased version */ - static public byte[] convertToUpperCase(final byte[] bases) { - for ( int i = 0; i < bases.length; i++ ) { - if ( (char)bases[i] >= 'a' ) - bases[i] = toUpperCaseBase(bases[i]); - } - return bases; - } - - static public byte toUpperCaseBase(final byte base) { - switch (base) { - case 'a': - return 'A'; - case 'c': - return 'C'; - case 'g': - return 'G'; - case 't': - return 'T'; - case 'n': - return 'N'; - default: - return base; - } + static public void convertToUpperCase(final byte[] bases) { + StringUtil.toUpperCase(bases); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index fcde1f419..befd24307 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -55,22 +55,22 @@ public class Haplotype { * @param bases bases * @param qual qual */ - public Haplotype(byte[] bases, int qual) { - this.bases = bases; + public Haplotype( final byte[] bases, final int qual ) { + this.bases = bases.clone(); quals = new double[bases.length]; Arrays.fill(quals, (double)qual); } - public Haplotype(byte[] bases, double[] quals) { - this.bases = bases; - this.quals = quals; + public Haplotype( final byte[] bases, final double[] quals ) { + this.bases = bases.clone(); + this.quals = quals.clone(); } - public Haplotype(byte[] bases) { + public Haplotype( final byte[] bases ) { this(bases, 0); } - public Haplotype(byte[] bases, GenomeLoc loc) { + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; } @@ -140,10 +140,10 @@ public class Haplotype { } public double[] getQuals() { - return quals; + return quals.clone(); } public byte[] getBases() { - return bases; + 
return bases.clone(); } public long getStartPosition() { diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index d1bc75583..8339e38c9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; +import java.io.Serializable; import java.util.Comparator; import java.util.TreeSet; @@ -434,12 +435,14 @@ public class MannWhitneyU { * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values * and to ensure that rank ties are broken at random. */ - private class DitheringComparator implements Comparator> { + private static class DitheringComparator implements Comparator>, Serializable { public DitheringComparator() {} + @Override public boolean equals(Object other) { return false; } + @Override public int compare(Pair left, Pair right) { double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); if ( comp > 0 ) { return 1; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 7d1561fc5..0c096ea73 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -625,6 +625,10 @@ public class MathUtils { return maxElementIndex(array, array.length); } + public static int maxElementIndex(final byte[] array) { + return maxElementIndex(array, array.length); + } + public static int maxElementIndex(final int[] array, int endIndex) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -638,6 +642,24 @@ public 
class MathUtils { return maxI; } + public static int maxElementIndex(final byte[] array, int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static byte arrayMax(final byte[] array) { + return array[maxElementIndex(array)]; + } + + public static double arrayMax(final double[] array) { return array[maxElementIndex(array)]; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 8e660350f..decc54d47 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.activeregion; import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; @@ -15,7 +16,7 @@ import java.util.ArrayList; * Date: 1/4/12 */ -public class ActiveRegion implements HasGenomeLocation, Comparable { +public class ActiveRegion implements HasGenomeLocation { private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; @@ -58,9 +59,7 @@ public class ActiveRegion implements HasGenomeLocation, Comparable } public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return referenceReader.getSubsequenceAt( extendedLoc.getContig(), - Math.max(1, extendedLoc.getStart() - padding), - Math.min(referenceReader.getSequenceDictionary().getSequence(extendedLoc.getContig()).getSequenceLength(), extendedLoc.getStop() + padding) ).getBases(); + return 
getReference( referenceReader, padding, extendedLoc ); } public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { @@ -68,14 +67,15 @@ public class ActiveRegion implements HasGenomeLocation, Comparable } public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), - Math.max(1, fullExtentReferenceLoc.getStart() - padding), - Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); + return getReference( referenceReader, padding, fullExtentReferenceLoc ); } - @Override - public int compareTo( final ActiveRegion other ) { - return this.getLocation().compareTo(other.getLocation()); + private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), + Math.max(1, genomeLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); + StringUtil.toUpperCase(reference); + return reference; } @Override @@ -97,4 +97,19 @@ public class ActiveRegion implements HasGenomeLocation, Comparable if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; return true; } + + /** + * A comparator class which is used to sort ActiveRegions by their start location + */ + /* + public static class ActiveRegionStartLocationComparator implements Comparator { + + public ActiveRegionStartLocationComparator() {} + + @Override + public int compare(final ActiveRegion left, final ActiveRegion right) { + return left.getLocation().compareTo(right.getLocation()); + } + } + */ } \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java index 2c312678e..85c925204 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java @@ -105,7 +105,7 @@ public class Allele implements Comparable { if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); } else { - bases = BaseUtils.convertToUpperCase(bases); + BaseUtils.convertToUpperCase(bases); } this.isRef = isRef; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1fe6b8652..8015889f5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.apache.commons.math.stat.descriptive.rank.Max; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; @@ -178,9 +179,8 @@ import java.util.*; */ public class VariantContext implements Feature { // to enable tribble integration private final static boolean WARN_ABOUT_BAD_END = true; + private final static long MAX_ALLELE_SIZE_FOR_NON_SV = 150; final protected static Logger logger = Logger.getLogger(VariantContext.class); - - private boolean fullyDecoded = false; protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; @@ -458,6 +458,7 @@ public class VariantContext implements Feature { // to enable tribble integratio SNP, MNP, // a multi-nucleotide polymorphism INDEL, + STRUCTURAL_INDEL, SYMBOLIC, MIXED, } @@ -530,6 +531,18 @@ public class VariantContext implements Feature { 
// to enable tribble integratio return getType() == Type.SYMBOLIC; } + public boolean isStructuralIndel() { + return getType() == Type.STRUCTURAL_INDEL; + } + + /** + * + * @return true if the variant is symbolic or a large indel + */ + public boolean isSymbolicOrSV() { + return isSymbolic() || isStructuralIndel(); + } + public boolean isMNP() { return getType() == Type.MNP; } @@ -1250,6 +1263,14 @@ public class VariantContext implements Feature { // to enable tribble integratio // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. + + // Because a number of structural variation callers write the whole alternate allele into the VCF where possible, + // this can result in insertion/deletion alleles of structural variant size, e.g. 151+. As of July 2012, we now + // classify these as structural events, rather than indel events, as we think differently about the mechanism, + // representation, and handling of these events. 
Check for this case here: + if ( ref.length() > MAX_ALLELE_SIZE_FOR_NON_SV || allele.length() > MAX_ALLELE_SIZE_FOR_NON_SV ) + return Type.STRUCTURAL_INDEL; + return Type.INDEL; // old incorrect logic: diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index d8ab4bd23..40ac089ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -94,6 +94,7 @@ public class VariantContextBuilder { this.start = start; this.stop = stop; this.alleles = alleles; + this.attributes = Collections.emptyMap(); // immutable toValidate.add(VariantContext.Validation.ALLELES); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index dc908c323..edd97f17f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -104,7 +104,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); after.setCigarString("10M"); - List reads = Arrays.asList(before,during,after); + List reads = Arrays.asList(before, during, after); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads,readAttributes); @@ -134,9 +134,9 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // create a test version of the Reads object ReadProperties readAttributes = createTestReadProperties(); - SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,firstLocus,76); + SAMRecord indelOnlyRead = 
ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); indelOnlyRead.setCigarString("76I"); List reads = Arrays.asList(indelOnlyRead); @@ -148,10 +148,10 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // and considers it to be an indel-containing read. Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),firstLocus,"Base pileup is at incorrect location."); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); ReadBackedPileup basePileup = alignmentContext.getBasePileup(); Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); - Assert.assertSame(basePileup.getReads().get(0),indelOnlyRead,"Read in pileup is incorrect"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); } /** @@ -168,7 +168,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { leadingRead.setCigarString("1M75I"); SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); indelOnlyRead.setCigarString("76I"); @@ -177,7 +177,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); fullMatchAfterIndel.setCigarString("75I1M"); - List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); + List reads = Arrays.asList(leadingRead, 
indelOnlyRead, fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads, createTestReadProperties()); @@ -204,7 +204,55 @@ public class LocusIteratorByStateUnitTest extends BaseTest { numAlignmentContextsFound++; } - Assert.assertEquals(numAlignmentContextsFound,2,"Found incorrect number of alignment contexts"); + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly + */ + @Test + public void testWholeIndelReadRepresentedTest() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); + + List reads = Arrays.asList(read1); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = 
alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } } private static ReadProperties createTestReadProperties() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 17d27c156..aa4fd7a75 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("95b0627bfcac2191aed9908904e892ff")); + Arrays.asList("4a0318d0452d2dccde48ef081c431bf8")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("0e2509349fd6c8a9e9408c918215e1de")); + Arrays.asList("da19c8e3c58340ba8bcc88e95ece4ac1")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { 
WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("32d81a7797605afb526983a2ab45efc2")); + Arrays.asList("cdefe79f46482a3d050ca2132604663a")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("350539ccecea0d1f7fffd4ac29c015e7")); + Arrays.asList("5ec4c07b6801fca7013e3b0beb8b5418")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("c222361819fae035a0162f876990fdee")); + Arrays.asList("28c07151f5c5fae87c691d8f7d1a3929")); executeTest("test overwriting header", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 6f1370008..9bec1b75d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -9,7 +9,7 @@ import java.util.Arrays; import java.util.List; /** - * IF 
THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * Integration tests for the Depth of Coverage walker * * @Author chartl * @Date Feb 25, 2010 diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7b6e1ee96..7390ec206 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("0039fd0464c87e6ce66c4c8670fd8dfa")); + Arrays.asList("9a7fa3e9ec8350e3e9cfdce0c00ddcc3")); executeTest("test MultiSample Pilot1", spec); } @@ -36,7 +36,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d1e68d4db6585ec00213b1d2d05e01a9")); + Arrays.asList("78693f3bf5d588e250507a596aa400da")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + 
validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("b53860d209f8440f12b78d01606553e1")); + Arrays.asList("babf24ec8e5b5708d4a049629f7ea073")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("61007c22c00a2871237280914a8f88f0")); + Arrays.asList("754187e70c1d117087e2270950a1c230")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("feda4a38bba096f7b740a146055509c2")); + Arrays.asList("f9a2f882d050a90e6d8e6a1fba00f858")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("0ff525e65c5836289c454c76ead5d80e")); + Arrays.asList("8a4ad38ec8015eea3461295148143428")); executeTest("test reverse trim", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // 
-------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "e1a17f8f852c3d639f26e659d37bc1e5"; + private final static String COMPRESSED_OUTPUT_MD5 = "ebb42960e115fb8dacd3edff5541b4da"; @Test public void testCompressedOutput() { @@ -139,7 +139,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("b0b92abbaaa4c787dce6f1b302f983ee")); + Arrays.asList("91f7e112200ed2c3b0a5d0d9e16e9369")); executeTest("test min_base_quality_score 26", spec); } @@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("186d33429756c89aad6cd89424d6dc94")); + Arrays.asList("b86e52b18496ab43a6b9a1bda632b5e6")); executeTest("test SLOD", spec); } @@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("11b87f68b8530da168c1418513115f30")); + Arrays.asList("79b3e4f8b4476ce3c3acbc271d6ddcdc")); executeTest("test NDA", spec); } @@ -163,23 +163,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("d2be4b1af1f29579c4f96c08e1ddd871")); + Arrays.asList("bf7f21a600956eda0a357b97b21e3069")); executeTest("test using comp track", spec); } @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "0055bd060e6ef53a6b836903d68953c9"); + testOutputParameters("-sites_only", "976109543d8d97d94e0fe0521ff326e8"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "235bec0a7b2d901442261104db18f5eb"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8084a847f4a3c53a030e8c52eec35cea"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "7c57ede7019063c19aa9d2136045d84f"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "931e396f2a6903a291e813c64c18f8b5"); } private void testOutputParameters(final String args, final String md5) { @@ -193,7 +193,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("3f8d724a5158adac4df38c4e2ed04167")); + Arrays.asList("e94be02fc5484c20b512840884e3d463")); executeTest("test confidence 1", spec1); } @@ -201,7 +201,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - 
Arrays.asList("3f8d724a5158adac4df38c4e2ed04167")); + Arrays.asList("e94be02fc5484c20b512840884e3d463")); executeTest("test confidence 2", spec2); } @@ -212,12 +212,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "7e7384a3a52e19f76f368c2f4561d510" ); + testHeterozosity( 0.01, "0dca2699f709793026b853c6f339bf08" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "3d16366d870c086e894c07c9da411795" ); + testHeterozosity( 1.0 / 1850, "35f14e436927e64712a8e28080e90c91" ); } private void testHeterozosity(final double arg, final String md5) { @@ -241,7 +241,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("58abc4f504d3afd42271e290ac846c4b")); + Arrays.asList("0360b79163aa28ae66d0dde4c26b3d76")); executeTest(String.format("test multiple technologies"), spec); } @@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("e247f579f01eb698cfa1ae1e8a3995a8")); + Arrays.asList("59892388916bdfa544750ab76e43eabb")); executeTest(String.format("test calling with BAQ"), spec); } @@ -279,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("cc2167dce156f70f5a31ac3dce499266")); + Arrays.asList("6aa034f669ec09ac4f5a28624cbe1830")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1268bde77842e6bb6a4f337c1d589f4d")); + Arrays.asList("ba7a011d0c665acc4455d58a6ab28716")); executeTest(String.format("test indel caller in 
SLX with low min allele count"), spec); } @@ -307,7 +307,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("10c86ff98ad5ab800d208b435bcfbd7d")); + Arrays.asList("4f7d80f4f53ef0f0959414cb30097482")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("c0c4dbb050296633a3150b104b77e05a")); + Arrays.asList("95986d0c92436d3b9c1f1be9c768a368")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("2472722f87f8718861698f60bbba2462")); + Arrays.asList("cecd3e35a817e299e97e8f7bbf083d2c")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -335,13 +335,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("eeb64b261f0a44aa478d753dbbf9378e")); + Arrays.asList("c3f786a5228346b43a80aa80d22b1490")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec 
spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("d0a66c234056bb83dd84113bc2421f1e")); + Arrays.asList("1a4d856bfe53d9acee0ea303c4b83bb1")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -351,7 +351,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("db0f91abb901e097714d8755058e1319")); + Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } @@ -363,7 +363,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("b3c923ed9efa04b85fc18a9b45c8d2a6")); + Arrays.asList("59ff26d7e5ca2503ebe9f74902251551")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -397,7 +397,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("160600dfa8e46f91dbb5d574517aac74")); + Arrays.asList("f99f9a917529bfef717fad97f725d5df")); executeTest("test minIndelFraction 0.0", spec); } @@ -405,7 +405,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - 
Arrays.asList("aa58dc9f77132c30363562bcdc321f6e")); + Arrays.asList("eac2cd649bd5836068350eb4260aaea7")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java new file mode 100644 index 000000000..07e82b869 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -0,0 +1,92 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/20/12 + * Time: 9:57 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class VariantsToBinaryPedIntegrationTest extends WalkerTest { + + public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/"; + + public static String baseTestString(String inputVCF, String inputMetaData, int gq) { + return "-T VariantsToBinaryPed -R " + b37KGReference + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + + @Test + public void testNA12878Alone() { + String testName = "testNA12878Alone"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testNA12878AloneMetaData() { + String testName = "testNA12878AloneMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrio() { + String testName = "testCEUTrio"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrioMetaData() { + String testName = "testCEUTrioMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testMalformedFam() { + 
String testName = "testMalformedFam"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + } +} + + diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index 272166c68..19620b8df 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -750,6 +750,10 @@ public class VariantContextUnitTest extends BaseTest { modified = new VariantContextBuilder(modified).attributes(null).attribute("AC", 1).make(); Assert.assertEquals(modified.getAttribute("AC"), 1); + // test the behavior when the builder's attribute object is not initialized + modified = new VariantContextBuilder(modified.getSource(), modified.getChr(), modified.getStart(), modified.getEnd(), modified.getAlleles()).attribute("AC", 1).make(); + + // test normal attribute modification modified = new VariantContextBuilder(cfg.vc).attribute("AC", 1).make(); Assert.assertEquals(modified.getAttribute("AC"), 1); modified = new VariantContextBuilder(modified).attribute("AC", 2).make(); diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 041e84a8c..775847ba9 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -105,8 +105,7 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) - - qGraph.settings = settings + qGraph.initializeWithSettings(settings) val allQScripts = 
pluginManager.createAllTypes(); for (script <- allQScripts) { @@ -137,26 +136,9 @@ class QCommandLine extends CommandLineProgram with Logging { logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) - if (!settings.disableJobReport) { - val jobStringName = { - if (settings.jobReportFile != null) - settings.jobReportFile - else - settings.qSettings.runName + ".jobreport.txt" - } - - if (!shuttingDown) { - val reportFile = IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) - logger.info("Writing JobLogging GATKReport to file " + reportFile) - QJobReport.printReport(functionsAndStatus, reportFile) - - if (settings.run) { - val pdfFile = IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf") - logger.info("Plotting JobLogging GATKReport to file " + pdfFile) - QJobReport.plotReport(reportFile, pdfFile) - } - } - } + // write the final complete job report + logger.info("Writing final jobs report...") + qGraph.writeJobsReport() if (!qGraph.success) { logger.info("Done with errors") diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 1a50301f1..429428c4c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -52,8 +52,8 @@ class QSettings { @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) var jobEnvironmentNames: Seq[String] = Nil - @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) - var memoryLimit: Option[Double] = None + @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. 
If not set defaults to 2GB.", required=false) + var memoryLimit: Option[Double] = Some(2) @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) var memoryLimitThreshold: Option[Double] = None diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index e3a1714ff..2c33596e1 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -39,7 +39,7 @@ import collection.immutable.{TreeSet, TreeMap} import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction} import java.util.Date import org.broadinstitute.sting.utils.Utils -import org.apache.commons.io.{FileUtils, IOUtils} +import org.apache.commons.io.{FilenameUtils, FileUtils, IOUtils} import java.io.{OutputStreamWriter, File} /** @@ -71,6 +71,16 @@ class QGraph extends Logging { private val inProcessManager = new InProcessJobManager private def managers = Seq[Any](inProcessManager, commandLineManager) + /** + * If true, we will write out incremental job reports + */ + private val INCREMENTAL_JOBS_REPORT = true + + /** + * Holds the optional jobInfoReporter structure + */ + private var jobInfoReporter: QJobsReporter = null + private class StatusCounts { var pending = 0 var running = 0 @@ -79,6 +89,19 @@ class QGraph extends Logging { } private val statusCounts = new StatusCounts + /** + * Final initialization step of this QGraph -- tell it runtime setting information + * + * The settings aren't necessarily available until after this QGraph object has been constructed, so + * this function must be called once the QGraphSettings have been filled in. 
+ * + * @param settings + */ + def initializeWithSettings(settings: QGraphSettings) { + this.settings = settings + this.jobInfoReporter = createJobsReporter() + } + /** * Adds a QScript created CommandLineFunction to the graph. * @param command Function to add to the graph. @@ -467,6 +490,12 @@ class QGraph extends Logging { checkRetryJobs(failedJobs) } + // incremental + if ( logNextStatusCounts && INCREMENTAL_JOBS_REPORT ) { + logger.info("Writing incremental jobs reports...") + writeJobsReport(false) + } + readyJobs ++= getReadyJobs } @@ -1084,6 +1113,39 @@ class QGraph extends Logging { } } + /** + * Create the jobsReporter for this QGraph, based on the settings data. + * + * Must be called after settings has been initialized properly + * + * @return + */ + private def createJobsReporter(): QJobsReporter = { + val jobStringName = if (settings.jobReportFile != null) + settings.jobReportFile + else + settings.qSettings.runName + ".jobreport.txt" + + val reportFile = org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) + + val pdfFile = if ( settings.run ) + Some(org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf")) + else + None + + new QJobsReporter(settings.disableJobReport, reportFile, pdfFile) + } + + /** + * Write, if possible, the jobs report + */ + def writeJobsReport(plot: Boolean = true) { + // note: the previous logic didn't write the job report if the system was shutting down, but I don't + // see any reason not to write the job report + if ( jobInfoReporter != null ) + jobInfoReporter.write(this, plot) + } + /** * Returns true if the graph was shutdown instead of exiting on its own. 
*/ diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index c69a310b3..0600f9ad5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -25,13 +25,8 @@ package org.broadinstitute.sting.queue.util import org.broadinstitute.sting.queue.function.QFunction -import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} -import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.gatk.report.GATKReportTable import org.broadinstitute.sting.queue.engine.JobRunInfo -import java.io.{PrintStream, File} -import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} -import org.broadinstitute.sting.utils.io.Resource -import org.apache.commons.io.{IOUtils, FileUtils} /** * A mixin to add Job info to the class @@ -98,31 +93,10 @@ trait QJobReport extends Logging { } object QJobReport { - val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" - // todo -- fixme to have a unique name for Scatter/gather jobs as well var seenCounter = 1 var seenNames = Set[String]() - def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { - val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) - jobs foreach {case (qf, info) => qf.setRunInfo(info)} - val stream = new PrintStream(FileUtils.openOutputStream(dest)) - try { - printJobLogging(jobs.keys.toSeq, stream) - } finally { - IOUtils.closeQuietly(stream) - } - } - - def plotReport(reportFile: File, pdfFile: File) { - val executor = new RScriptExecutor - executor.addLibrary(RScriptLibrary.GSALIB) - executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) - executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) - executor.exec() - } - def workAroundSameJobNames(func: QFunction):String = { if ( seenNames.apply(func.jobName) ) 
{ seenCounter += 1 @@ -132,45 +106,4 @@ object QJobReport { func.jobName } } - - /** - * Prints the JobLogging logs to a GATKReport. First splits up the - * logs by group, and for each group generates a GATKReportTable - */ - private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { - // create the report - val report: GATKReport = new GATKReport - - // create a table for each group of logs - for ( (group, groupLogs) <- groupLogs(logs) ) { - val keys = logKeys(groupLogs) - report.addTable(group, "Job logs for " + group, keys.size) - val table: GATKReportTable = report.getTable(group) - - // add the columns - keys.foreach(table.addColumn(_)) - for (log <- groupLogs) { - for ( key <- keys ) - table.set(log.getReportName, key, log.getReportFeature(key)) - } - } - - report.print(stream) - } - - private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { - logs.groupBy(_.getReportGroup) - } - - private def logKeys(logs: Seq[QFunction]): Set[String] = { - // the keys should be the same for each log, but we will check that - val keys = Set[String](logs(0).getReportFeatureNames : _*) - - for ( log <- logs ) - if ( keys.sameElements(Set(log.getReportFeatureNames)) ) - throw new UserException(("All JobLogging jobs in the same group must have the same set of features. 
" + - "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) - - keys - } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala new file mode 100644 index 000000000..a23fe4485 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.util + +import java.io.{PrintStream, File} +import org.broadinstitute.sting.utils.io.{Resource} +import org.broadinstitute.sting.queue.engine.{JobRunInfo, QGraph} +import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} +import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.apache.commons.io.{FileUtils, IOUtils} + +/** + * Writes out RunInfo to a GATKReport + */ +class QJobsReporter(val disabled: Boolean, val reportFile: File, val pdfFile: Option[File]) extends Logging { + private val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" + + /** + * Write out a job report based on the finished jobs graph + * @param jobGraph + * @param enabledPlotting if true, we will plot the report as well with the JOB_REPORT_QUEUE_SCRIPT + */ + def write(jobGraph: QGraph, enabledPlotting: Boolean) { + if ( ! 
disabled ) { + logger.info("Writing JobLogging GATKReport to file " + reportFile) + printReport(jobGraph.getFunctionsAndStatus, reportFile) + + if ( enabledPlotting ) + pdfFile match { + case Some(file) => + logger.info("Plotting JobLogging GATKReport to file " + file) + plotReport(reportFile, file) + case None => + } + } + } + + private def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { + val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) + jobs foreach {case (qf, info) => qf.setRunInfo(info)} + val stream = new PrintStream(FileUtils.openOutputStream(dest)) + try { + printJobLogging(jobs.keys.toSeq, stream) + } finally { + IOUtils.closeQuietly(stream) + } + } + + private def plotReport(reportFile: File, pdfFile: File) { + val executor = new RScriptExecutor + executor.addLibrary(RScriptLibrary.GSALIB) + executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) + executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) + executor.exec() + } + + /** + * Prints the JobLogging logs to a GATKReport. 
First splits up the + * logs by group, and for each group generates a GATKReportTable + */ + private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { + // create the report + val report: GATKReport = new GATKReport + + // create a table for each group of logs + for ( (group, groupLogs) <- groupLogs(logs) ) { + val keys = logKeys(groupLogs) + report.addTable(group, "Job logs for " + group, keys.size) + val table: GATKReportTable = report.getTable(group) + + // add the columns + keys.foreach(table.addColumn(_)) + for (log <- groupLogs) { + for ( key <- keys ) + table.set(log.getReportName, key, log.getReportFeature(key)) + } + } + + report.print(stream) + } + + private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { + logs.groupBy(_.getReportGroup) + } + + private def logKeys(logs: Seq[QFunction]): Set[String] = { + // the keys should be the same for each log, but we will check that + val keys = Set[String](logs(0).getReportFeatureNames : _*) + + for ( log <- logs ) + if ( keys.sameElements(Set(log.getReportFeatureNames)) ) + throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " + + "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) + + keys + } +}