I want to move over to hpprojects tonight, so I'm checking in various changes all in one go:

1. Initial code for annotating calls with the base mismatch rate within a reference window (still needs analysis).
2. Move error checking code from rodVCF to VCFRecord.
3. More improvements to SNP Genotype callset concordance.
4. Fixed some comments in Variation/Genotype



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2341 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-13 02:52:18 +00:00
parent 2748eb60e1
commit bd2a46ab4c
8 changed files with 146 additions and 25 deletions

View File

@ -42,11 +42,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
}
}
public void assertBiAllelic() {
if ( !isBiallelic() )
throw new StingException("This VCF rod is not bi-allelic.");
}
@Override
public boolean parseLine(Object header, String[] parts) throws IOException {
throw new UnsupportedOperationException("RodVCF does not support the parseLine method");
@ -115,7 +110,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*/
public boolean isSNP() {
assertNotNull();
assertBiAllelic();
return mCurrentRecord.isSNP();
}
@ -126,7 +120,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*/
public boolean isInsertion() {
assertNotNull();
assertBiAllelic();
return mCurrentRecord.isInsertion();
}
@ -137,7 +130,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
*/
public boolean isDeletion() {
assertNotNull();
assertBiAllelic();
return mCurrentRecord.isDeletion();
}

View File

@ -0,0 +1,34 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.*;
import java.util.Map;
public class MismatchRate implements VariantAnnotation {
public String annotate(ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, Variation variation) {
int mismatches = 0;
int totalBases = 0;
for ( String sample : stratifiedContexts.keySet() ) {
ReadBackedPileup pileup = stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup();
Pair<Integer, Integer> counts = AlignmentUtils.mismatchesInRefWindow(pileup, ref, true);
mismatches += counts.first;
totalBases += counts.second;
}
// sanity check
if ( totalBases == 0 )
return null;
return String.format("%.2f", (double)mismatches/(double)totalBases);
}
public String getKeyName() { return "MR"; }
public String getDescription() { return "MR,1,Float,\"Mismatch Rate of Reads Spanning This Position\""; }
}

View File

@ -30,18 +30,19 @@ public class SNPGenotypeConcordance implements ConcordanceType {
}
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
char refBase = ref.getBase();
Genotype call1 = samplesToRecords.get(sample1);
Genotype call2 = samplesToRecords.get(sample2);
if ( call1 == null || call2 == null ) {
if ( call1 != null && call1.isPointGenotype() ) {
if ( call1 != null && call1.isPointGenotype() && call1.isVariant(refBase) ) {
if ( 10.0 * call1.getNegLog10PError() >= Qscore )
return "set1ConfidentSet2NoCall";
else
return "set2NoCall";
}
else if ( call2 != null && call2.isPointGenotype() ) {
else if ( call2 != null && call2.isPointGenotype() && call2.isVariant(refBase) ) {
if (10.0 * call2.getNegLog10PError() >= Qscore )
return "set1NoCallSet2Confident";
else
@ -50,13 +51,19 @@ public class SNPGenotypeConcordance implements ConcordanceType {
return null;
}
// if either is an indel, skip this site
if ( !call1.isPointGenotype() || !call2.isPointGenotype() )
return null;
double confidence1 = 10.0 * call1.getNegLog10PError();
double confidence2 = 10.0 * call2.getNegLog10PError();
String genotype1 = call1.getBases();
String genotype2 = call2.getBases();
// are they both variant SNPs?
if ( call1.isPointGenotype() && call2.isPointGenotype() ) {
// are they both SNPs?
boolean call1IsVariant = call1.isVariant(refBase);
boolean call2IsVariant = call2.isVariant(refBase);
if ( call1IsVariant && call2IsVariant ) {
// are they confident calls?
boolean conf1 = confidence1 >= Qscore;
@ -87,10 +94,10 @@ public class SNPGenotypeConcordance implements ConcordanceType {
}
// one is variant and the other is ref
else if ( call1.isPointGenotype() && call2.isVariant(ref.getBase()) && confidence1 >= Qscore )
return "set1VariantSet2Ref";
else if ( call2.isPointGenotype() && call1.isVariant(ref.getBase()) && confidence2 >= Qscore )
return "set1RefSet2Variant";
else if ( call1IsVariant )
return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "VariantSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Ref";
else if ( call2IsVariant )
return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "RefSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Variant";
return null;
}

View File

@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.pileup.*;
/**
@ -121,7 +123,7 @@ public class AlignmentUtils {
public static int numMismatches(SAMRecord r, String refSeq, int refIndex) {
int readIdx = 0;
int mismatches = 0;
String readSeq = r.getReadString();
byte[] readSeq = r.getReadBases();
Cigar c = r.getCigar();
for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i);
@ -131,7 +133,7 @@ public class AlignmentUtils {
if ( refIndex >= refSeq.length() )
continue;
char refChr = refSeq.charAt(refIndex);
char readChr = readSeq.charAt(readIdx);
char readChr = (char)readSeq[readIdx];
// Note: we need to count X/N's as mismatches because that's what SAM requires
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
@ -153,6 +155,89 @@ public class AlignmentUtils {
return mismatches;
}
/** Returns the number of mismatches in the pileup within the given reference context.
*
* @param pileup the pileup with reads
* @param ref the reference context
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
* @return a pair where the first value is the mismatches and the second is the total bases within the context
*/
public static Pair<Integer, Integer> mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
Pair<Integer, Integer> mismatches = new Pair<Integer, Integer>(0, 0);
for ( PileupElement p : pileup ) {
Pair<Integer, Integer> mm = mismatchesInRefWindow(p, ref, ignoreTargetSite);
mismatches.first += mm.first;
mismatches.second += mm.second;
}
return mismatches;
}
/** Returns the number of mismatches in the pileup element within the given reference context.
*
* @param p the pileup element
* @param ref the reference context
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
* @return a pair where the first value is the mismatches and the second is the total bases within the context
*/
public static Pair<Integer, Integer> mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
int mismatches = 0;
int totalBases = 0;
GenomeLoc window = ref.getWindow();
char[] refBases = ref.getBases();
byte[] readBases = p.getRead().getReadBases();
Cigar c = p.getRead().getCigar();
int readIndex = 0;
int currentPos = p.getRead().getAlignmentStart();
int refIndex = Math.max(0, currentPos - (int)window.getStart());
for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i);
switch ( ce.getOperator() ) {
case M:
for (int j = 0; j < ce.getLength(); j++, readIndex++, currentPos++) {
// are we past the ref window?
if ( currentPos > window.getStop() )
break;
// are we before the ref window?
if ( currentPos < window.getStart() )
continue;
char refChr = refBases[refIndex++];
// do we need to skip the target site?
if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos )
continue;
totalBases++;
char readChr = (char)readBases[readIndex];
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
mismatches++;
}
break;
case I:
readIndex += ce.getLength();
break;
case D:
currentPos += ce.getLength();
if ( currentPos > window.getStart() )
refIndex += Math.min(ce.getLength(), currentPos - window.getStart());
break;
case S:
readIndex += ce.getLength();
break;
default: throw new StingException("Only M,I,D,S cigar elements are currently supported; there was " + ce.getOperator());
}
}
//System.out.println("There are " + mismatches + " mismatches out of " + totalBases + " bases for read " + p.getRead().getReadName());
return new Pair<Integer, Integer>(mismatches, totalBases);
}
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is

View File

@ -53,9 +53,9 @@ public interface Genotype {
public GenomeLoc getLocation();
/**
* returns true if the genotype is a point genotype, false if it's a indel / deletion
* returns true if the genotype is a point genotype, false if it's an indel
*
* @return true is a SNP
* @return true if this is a SNP
*/
public boolean isPointGenotype();

View File

@ -120,7 +120,7 @@ public interface Variation {
/**
* gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
* of
* or if we're not bi-allelic
*
* @return a char, representing the alternate base
*/

View File

@ -248,6 +248,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
}
public double getNonRefAlleleFrequency() {
if ( !isBiallelic() )
throw new StingException("This VCF record is not bi-allelic.");
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
} else {

View File

@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testSimpleVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1,
Arrays.asList("2c7e18901dbf27bac9f36b3dbee063c6"));
Arrays.asList("5c8e4757d2ce46bc50991a171f988327"));
executeTest("testSimpleVenn", spec);
}
@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testSNPConcordance() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
Arrays.asList("8a17c93572a2c498feeaf8ba172d40d6"));
Arrays.asList("b4904813cdfd37f0a092aa6cadcd3f71"));
executeTest("testSNPConcordance", spec);
}
@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testNWayVenn() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1,
Arrays.asList("2b38ae235edd10773dbee0bfae036e35"));
Arrays.asList("cf915dc9762d6a44a7bdadc0d7eae9b8"));
executeTest("testNWayVenn", spec);
}
@ -38,7 +38,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
public void testMulti() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1,
Arrays.asList("4e0f768389028dbd09266ee1802ccd3b"));
Arrays.asList("d046599a2fef386fa0ad5dfc9671c3a9"));
executeTest("testMulti", spec);
}
}