From bd2a46ab4cd88112dd1c2fb85e5f63a568c00e7e Mon Sep 17 00:00:00 2001 From: ebanks Date: Sun, 13 Dec 2009 02:52:18 +0000 Subject: [PATCH] I want to move over to hpprojects tonight, so I'm checking in various changes all in one go: 1. Initial code for annotating calls with the base mismatch rate within a reference window (still needs analysis). 2. Move error checking code from rodVCF to VCFRecord. 3. More improvements to SNP Genotype callset concordance. 4. Fixed some comments in Variation/Genotype git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2341 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/RodVCF.java | 8 -- .../gatk/walkers/annotator/MismatchRate.java | 34 +++++++ .../concordance/SNPGenotypeConcordance.java | 23 +++-- .../sting/utils/AlignmentUtils.java | 89 ++++++++++++++++++- .../sting/utils/genotype/Genotype.java | 4 +- .../sting/utils/genotype/Variation.java | 2 +- .../sting/utils/genotype/vcf/VCFRecord.java | 3 + .../CallsetConcordanceIntegrationTest.java | 8 +- 8 files changed, 146 insertions(+), 25 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java index bc6a2b44d..b6174fe98 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java @@ -42,11 +42,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, } } - public void assertBiAllelic() { - if ( !isBiallelic() ) - throw new StingException("This VCF rod is not bi-allelic."); - } - @Override public boolean parseLine(Object header, String[] parts) throws IOException { throw new UnsupportedOperationException("RodVCF does not support the parseLine method"); @@ -115,7 +110,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, */ public boolean isSNP() { assertNotNull(); - assertBiAllelic(); return mCurrentRecord.isSNP(); } @@ -126,7 +120,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, */ public boolean isInsertion() { assertNotNull(); - assertBiAllelic(); return mCurrentRecord.isInsertion(); } @@ -137,7 +130,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, */ public boolean isDeletion() { assertNotNull(); - assertBiAllelic(); return mCurrentRecord.isDeletion(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java new file mode 100755 index 000000000..9869f443b --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.utils.pileup.*; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.utils.*; + +import java.util.Map; + + +public class MismatchRate implements VariantAnnotation { + + public String annotate(ReferenceContext ref, Map stratifiedContexts, Variation variation) { + int mismatches = 0; + int totalBases = 0; + for ( String sample : stratifiedContexts.keySet() ) { + ReadBackedPileup pileup = stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup(); + Pair counts = AlignmentUtils.mismatchesInRefWindow(pileup, ref, true); + mismatches += counts.first; + totalBases += counts.second; + } + + // sanity check + if ( totalBases == 0 ) + return null; + + return String.format("%.2f", (double)mismatches/(double)totalBases); + } + + public String getKeyName() { return "MR"; } + + public String getDescription() { return "MR,1,Float,\"Mismatch Rate of Reads Spanning This Position\""; } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/SNPGenotypeConcordance.java b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/SNPGenotypeConcordance.java index 075333b46..4f3db620b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/SNPGenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/SNPGenotypeConcordance.java @@ -30,18 +30,19 @@ public class SNPGenotypeConcordance implements ConcordanceType { } public String computeConcordance(Map samplesToRecords, ReferenceContext ref) { + char refBase = ref.getBase(); Genotype call1 = samplesToRecords.get(sample1); Genotype call2 = samplesToRecords.get(sample2); if ( call1 == null || call2 == null ) { - if ( call1 != null && call1.isPointGenotype() ) { + if ( call1 != null && call1.isPointGenotype() && call1.isVariant(refBase) ) { if ( 10.0 * call1.getNegLog10PError() >= Qscore ) return "set1ConfidentSet2NoCall"; else return "set2NoCall"; } - else if ( call2 != null && call2.isPointGenotype() ) { + else if ( call2 != null && call2.isPointGenotype() && call2.isVariant(refBase) ) { if (10.0 * call2.getNegLog10PError() >= Qscore ) return "set1NoCallSet2Confident"; else @@ -50,13 +51,19 @@ public class SNPGenotypeConcordance implements ConcordanceType { return null; } + // if either is an indel, skip this site + if ( !call1.isPointGenotype() || !call2.isPointGenotype() ) + return null; + double confidence1 = 10.0 * call1.getNegLog10PError(); double confidence2 = 10.0 * call2.getNegLog10PError(); String genotype1 = call1.getBases(); String genotype2 = call2.getBases(); - // are they both variant SNPs? - if ( call1.isPointGenotype() && call2.isPointGenotype() ) { + // are they both SNPs? + boolean call1IsVariant = call1.isVariant(refBase); + boolean call2IsVariant = call2.isVariant(refBase); + if ( call1IsVariant && call2IsVariant ) { // are they confident calls? boolean conf1 = confidence1 >= Qscore; @@ -87,10 +94,10 @@ public class SNPGenotypeConcordance implements ConcordanceType { } // one is variant and the other is ref - else if ( call1.isPointGenotype() && call2.isVariant(ref.getBase()) && confidence1 >= Qscore ) - return "set1VariantSet2Ref"; - else if ( call2.isPointGenotype() && call1.isVariant(ref.getBase()) && confidence2 >= Qscore ) - return "set1RefSet2Variant"; + else if ( call1IsVariant ) + return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "VariantSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Ref"; + else if ( call2IsVariant ) + return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "RefSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Variant"; return null; } diff --git a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java index 28ee7caa4..287afc5d7 100644 --- a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java @@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.picard.reference.ReferenceSequence; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.pileup.*; /** @@ -121,7 +123,7 @@ public class AlignmentUtils { public static int numMismatches(SAMRecord r, String refSeq, int refIndex) { int readIdx = 0; int mismatches = 0; - String readSeq = r.getReadString(); + byte[] readSeq = r.getReadBases(); Cigar c = r.getCigar(); for (int i = 0 ; i < c.numCigarElements() ; i++) { CigarElement ce = c.getCigarElement(i); @@ -131,7 +133,7 @@ public class AlignmentUtils { if ( refIndex >= refSeq.length() ) continue; char refChr = refSeq.charAt(refIndex); - char readChr = readSeq.charAt(readIdx); + char readChr = (char)readSeq[readIdx]; // Note: we need to count X/N's as mismatches because that's what SAM requires //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) @@ -153,6 +155,89 @@ public class AlignmentUtils { return mismatches; } + /** Returns the number of mismatches in the pileup within the given reference context. + * + * @param pileup the pileup with reads + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return a pair where the first value is the mismatches and the second is the total bases within the context + */ + public static Pair mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { + Pair mismatches = new Pair(0, 0); + for ( PileupElement p : pileup ) { + Pair mm = mismatchesInRefWindow(p, ref, ignoreTargetSite); + mismatches.first += mm.first; + mismatches.second += mm.second; + } + return mismatches; + } + + /** Returns the number of mismatches in the pileup element within the given reference context. + * + * @param p the pileup element + * @param ref the reference context + * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) + * @return a pair where the first value is the mismatches and the second is the total bases within the context + */ + public static Pair mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { + + int mismatches = 0; + int totalBases = 0; + + GenomeLoc window = ref.getWindow(); + char[] refBases = ref.getBases(); + byte[] readBases = p.getRead().getReadBases(); + Cigar c = p.getRead().getCigar(); + + int readIndex = 0; + int currentPos = p.getRead().getAlignmentStart(); + int refIndex = Math.max(0, currentPos - (int)window.getStart()); + + for (int i = 0 ; i < c.numCigarElements() ; i++) { + CigarElement ce = c.getCigarElement(i); + switch ( ce.getOperator() ) { + case M: + for (int j = 0; j < ce.getLength(); j++, readIndex++, currentPos++) { + // are we past the ref window? + if ( currentPos > window.getStop() ) + break; + + // are we before the ref window? + if ( currentPos < window.getStart() ) + continue; + + char refChr = refBases[refIndex++]; + + // do we need to skip the target site? + if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos ) + continue; + + totalBases++; + char readChr = (char)readBases[readIndex]; + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + mismatches++; + } + break; + case I: + readIndex += ce.getLength(); + break; + case D: + currentPos += ce.getLength(); + if ( currentPos > window.getStart() ) + refIndex += Math.min(ce.getLength(), currentPos - window.getStart()); + break; + case S: + readIndex += ce.getLength(); + break; + default: throw new StingException("Only M,I,D,S cigar elements are currently supported; there was " + ce.getOperator()); + } + + } + + //System.out.println("There are " + mismatches + " mismatches out of " + totalBases + " bases for read " + p.getRead().getReadName()); + return new Pair(mismatches, totalBases); + } + /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java index 9da4be99c..977d14df3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java @@ -53,9 +53,9 @@ public interface Genotype { public GenomeLoc getLocation(); /** - * returns true if the genotype is a point genotype, false if it's a indel / deletion + * returns true if the genotype is a point genotype, false if it's an indel * - * @return true is a SNP + * @return true if this is a SNP */ public boolean isPointGenotype(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Variation.java b/java/src/org/broadinstitute/sting/utils/genotype/Variation.java index 0f4dad91a..2d5046a31 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Variation.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Variation.java @@ -120,7 +120,7 @@ public interface Variation { /** * gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP - * of + * or if we're not bi-allelic * * @return a char, representing the alternate base */ diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index b8a437b36..c6d029d51 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -248,6 +248,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { } public double getNonRefAlleleFrequency() { + if ( !isBiallelic() ) + throw new StingException("This VCF record is not bi-allelic."); + if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) { return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY)); } else { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java index f0961dcff..6f334920c 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceIntegrationTest.java @@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testSimpleVenn() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1, - Arrays.asList("2c7e18901dbf27bac9f36b3dbee063c6")); + Arrays.asList("5c8e4757d2ce46bc50991a171f988327")); executeTest("testSimpleVenn", spec); } @@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testSNPConcordance() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1, - Arrays.asList("8a17c93572a2c498feeaf8ba172d40d6")); + Arrays.asList("b4904813cdfd37f0a092aa6cadcd3f71")); executeTest("testSNPConcordance", spec); } @@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testNWayVenn() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1, - Arrays.asList("2b38ae235edd10773dbee0bfae036e35")); + Arrays.asList("cf915dc9762d6a44a7bdadc0d7eae9b8")); executeTest("testNWayVenn", spec); } @@ -38,7 +38,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest { public void testMulti() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1, - Arrays.asList("4e0f768389028dbd09266ee1802ccd3b")); + Arrays.asList("d046599a2fef386fa0ad5dfc9671c3a9")); executeTest("testMulti", spec); } } \ No newline at end of file