diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java index 1f060ba7d..7d2882dbb 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java @@ -44,7 +44,7 @@ public class StratifiedAlignmentContext { // Definitions: // COMPLETE = full alignment context - // FORWARD = reads on forward strand + // FORWARD = reads on forward strand // REVERSE = reads on forward strand // public enum StratifiedContextType { COMPLETE, FORWARD, REVERSE } @@ -82,6 +82,18 @@ public class StratifiedAlignmentContext { offsets[StratifiedContextType.COMPLETE.ordinal()].add(offset); } + /** + * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample. + * + * @param pileup the original pileup + * + * @return a Map of sample name to StratifiedAlignmentContext + * + **/ + public static Map splitContextBySample(ReadBackedPileup pileup) { + return splitContextBySample(pileup, null, null); + } + /** * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample. * diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index ebd663642..b247fd03c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -11,7 +11,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; import java.util.Map; -public class MappingQualityZero implements VariantAnnotation { +public class MappingQualityZero extends StandardVariantAnnotation { public String annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, Variation variation) { int mq0 = 0; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index b7af1d418..23a482f6e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -166,7 +166,7 @@ public class VariantAnnotator extends RodWalker { if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 && variant.isBiallelic() && variant.isSNP() ) { - Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup(), null, null); + Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup()); if ( stratifiedContexts != null ) annotations = getAnnotations(tracker, ref, stratifiedContexts, variant, requestedAnnotations); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index cc70ff1db..806e1ebc9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -75,9 +75,15 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public int MIN_BASE_QUALTY_SCORE = 20; + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) + public int MIN_MAPPING_QUALTY_SCORE = 10; + @Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false) public int MAX_MISMATCHES = 3; + @Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false) + public boolean USE_BADLY_MATED_READS = false; + @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 66ae3ac20..b50751504 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.contexts.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -49,7 +48,6 @@ import java.io.FileNotFoundException; * multi-sample, and pooled data. The user can choose from several different incorporated calculation models. */ @Reference(window=@Window(start=-20,stop=20)) -@ReadFilters({ZeroMappingQualityReadFilter.class,MappingQualityReadFilter.class,BadMateReadFilter.class}) public class UnifiedGenotyper extends LocusWalker>, Integer> implements TreeReducible { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -178,15 +176,16 @@ public class UnifiedGenotyper extends LocusWalker args = new HashSet(); @@ -229,10 +228,10 @@ public class UnifiedGenotyper extends LocusWalker annotations; if ( UAC.ALL_ANNOTATIONS ) annotations = VariantAnnotator.getAllAnnotations(tracker, refContext, stratifiedContexts, call.first); @@ -267,14 +269,16 @@ public class UnifiedGenotyper extends LocusWalker filteredPileup = new ArrayList(); for ( PileupElement p : pileup ) { - if ( AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= maxMismatches ) + if ( p.getMappingQual() >= UAC.MIN_MAPPING_QUALTY_SCORE && + (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) && + AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES ) filteredPileup.add(p); } return new ReadBackedPileup(pileup.getLocation(), filteredPileup); - } private static boolean isValidDeletionFraction(double d) { diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 4af47af9f..5b67715da 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -112,11 +112,13 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { Map genotypeMap = genotypeListToSampleNameMap(genotypes); + int totalAlleles = 0; for (String name : mHeader.getGenotypeSamples()) { if (genotypeMap.containsKey(name)) { Genotype gtype = genotypeMap.get(name); VCFGenotypeRecord record = VCFUtils.createVCFGenotypeRecord(params, (VCFGenotypeCall)gtype); params.addGenotypeRecord(record); + totalAlleles += record.getAlleles().size(); genotypeMap.remove(name); } else { VCFGenotypeRecord record = createNoCallRecord(name); @@ -132,6 +134,20 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { // info fields Map infoFields = getInfoFields((VCFVariationCall)locusdata); + if ( totalAlleles > 0 ) { + infoFields.put(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", totalAlleles)); + + // the allele counts are a bit trickier - we need to count up the alternate counts to get the ref count + List altAlleleCounts = params.getAlleleCounts(); + int totalAltAlleles = 0; + StringBuffer sb = new StringBuffer(); + for ( int alleleCount : altAlleleCounts ) { + sb.append(","); + sb.append(alleleCount); + totalAltAlleles += alleleCount; + } + infoFields.put(VCFRecord.ALLELE_COUNT_KEY, String.format("%d%s", (totalAlleles - totalAltAlleles), sb.toString())); + } // q-score double qual = (locusdata == null) ? 0 : ((VCFVariationCall)locusdata).getConfidence(); @@ -152,7 +168,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { VCFRecord.UNFILTERED, infoFields, params.getFormatString(), - params.getGenotypesRecords()); + params.getGenotypeRecords()); mWriter.addRecord(vcfRecord, validationStringency); } @@ -180,7 +196,6 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { infoFields.putAll(otherFields); } } - // no longer used: infoFields.put(VCFRecord.SAMPLE_NUMBER_KEY, String.valueOf(params.getGenotypesRecords().size())); return infoFields; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java index eca90ff84..db644039c 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFParameters.java @@ -16,9 +16,10 @@ class VCFParameters { private int position = 0; private String contig = null; private boolean initialized = false; - private List genotypesRecord = new ArrayList(); + private List genotypeRecords = new ArrayList(); private List formatList = new ArrayList(); private List alternateBases = new ArrayList(); + private List alleleCounts = new ArrayList(); public void setLocations(GenomeLoc location, char refBase) { // if we haven't set it up, we initialize the object @@ -56,7 +57,12 @@ class VCFParameters { } public void addGenotypeRecord(VCFGenotypeRecord record) { - this.genotypesRecord.add(record); + genotypeRecords.add(record); + for ( VCFGenotypeEncoding allele : record.getAlleles() ) { + int index = alternateBases.indexOf(allele); + if ( index != -1 ) // we don't keep track of ref alleles here + alleleCounts.set(index, alleleCounts.get(index)+1); + } } public void addFormatItem(String item) { @@ -67,19 +73,26 @@ class VCFParameters { public void addAlternateBase(VCFGenotypeEncoding base) { if ( !alternateBases.contains(base) && !base.toString().equals(String.valueOf(getReferenceBase()).toUpperCase()) && - !base.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) ) + !base.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) ) { alternateBases.add(base); + alleleCounts.add(0); + } } public List getAlternateBases() { return alternateBases; } + // the list of allele counts where each entry relates to the corresponding entry in the Alternate Base list + public List getAlleleCounts() { + return alleleCounts; + } + public String getFormatString() { return Utils.join(VCFRecord.FORMAT_FIELD_SEPERATOR, formatList); } - public List getGenotypesRecords() { - return genotypesRecord; + public List getGenotypeRecords() { + return genotypeRecords; } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java index 362e11b24..67a721b26 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java @@ -180,7 +180,6 @@ public class VCFUtils { Map infoFields = new HashMap(); infoFields.put(VCFRecord.DEPTH_KEY, String.format("%d", totalReadDepth)); - // no longer used: infoFields.put(VCFRecord.SAMPLE_NUMBER_KEY, String.valueOf(params.getGenotypesRecords().size())); // set the overall strand bias and allele frequency to be the average of all entries we've seen if ( SLODsSeen > 0 ) @@ -197,7 +196,7 @@ public class VCFUtils { filters.size() == 0 ? "0" : Utils.join(";", filters), infoFields, params.getFormatString(), - params.getGenotypesRecords()); + params.getGenotypeRecords()); } /** diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 32b36054f..1a4aa3d25 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF," + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("b0012ffce509196885bfdf1612086263")); + Arrays.asList("ad0d78f399bba23227cf07341708e27d")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("7e00e94a046e82ac41f6fd298a64cc74")); + Arrays.asList("17119c073a964620809eaaf3a39dfbc4")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF," + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a6630eec84161a089dde204e09744c45")); + Arrays.asList("b8609ff527e0882dfbe3505357fd2705")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF," + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("130658564083f650981b28728b813917")); + Arrays.asList("e865132880c6aa07bfdd725408fc21e8")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 82c726c73..d5e0bcea5 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -22,7 +22,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, - Arrays.asList("8acbb0879a7c9f35e7cf7d77a7cab850")); + Arrays.asList("706efd396f3ac5e3addff73c15d1ec87")); executeTest("testMultiSamplePilot1 - Point Estimate EM", spec); } @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, - Arrays.asList("a5ec48e6695d9a5e7fcced7775baf6ff")); + Arrays.asList("917d9ede2b2bccbcc41c6b7534788fa2")); executeTest("testMultiSamplePilot2 - Point Estimate EM", spec); } @@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testPooled1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1, - Arrays.asList("c20f370440ed55b0b6c38715985a850d")); + Arrays.asList("48b3719bf39705983876276b8da2642f")); executeTest("testPooled1", spec); } @@ -56,7 +56,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("0a2df3fc9fc72a3ace46aab899275636")); + Arrays.asList("057493e22a64cf5c78619707464cedbe")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @@ -64,7 +64,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("52249b69d5d984a8bdc0438e1f106dcb")); + Arrays.asList("05f32f8ba686260b79c98c661a01e8f8")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @@ -72,7 +72,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("172ca88b718f24fa0f2a3460b2dab33d")); + Arrays.asList("5fcc0ecff04d408d33ce5fdc0a8d8b89")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); } @@ -85,7 +85,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testParallelization() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,400,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30 -nt 4", 1, - Arrays.asList("d86d9148b1eee16da9a5bb57c929c0c9")); + Arrays.asList("b0e221e54f8ff9c6355654510acb7153")); executeTest("test parallelization", spec); } @@ -98,11 +98,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "b3161d019d467d5f9045d2f5d7203006" ); - e.put( "-all_bases", "655c7b8ce2f794a055fa49962b5a6a79" ); - e.put( "--min_base_quality_score 26", "67175e0262e626d46aaf3bc8680cf888" ); - e.put( "--min_mapping_quality_score 26", "8edb24206529a7a344008280ab69cbb2" ); - e.put( "--max_mismatches_in_40bp_window 5", "870812ed23b2c656439dc1c8e6376f89" ); + e.put( "-genotype", "d82b53aad30005330203a7ca07ce83d5" ); + e.put( "-all_bases", "563566eaa2c621e88080149512d368b6" ); + e.put( "--min_base_quality_score 26", "015509aca82954b89d7dc83134f2103e" ); + e.put( "--min_mapping_quality_score 26", "ba7fe92be077750225c762981ead4b0d" ); + e.put( "--max_mismatches_in_40bp_window 5", "abecda3de7b195d6d0f7ff97bc624b5f" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -116,7 +116,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 10 ", 1, - Arrays.asList("eb708a714a0ecbebed707f2d9684e165")); + Arrays.asList("c434515cae99d2da5a353f0b0866af81")); executeTest("testConfidence", spec); }