I want to move over to hpprojects tonight, so I'm checking in various changes all in one go:
1. Initial code for annotating calls with the base mismatch rate within a reference window (still needs analysis). 2. Move error checking code from rodVCF to VCFRecord. 3. More improvements to SNP Genotype callset concordance. 4. Fixed some comments in Variation/Genotype git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2341 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2748eb60e1
commit
bd2a46ab4c
|
|
@ -42,11 +42,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void assertBiAllelic() {
|
|
||||||
if ( !isBiallelic() )
|
|
||||||
throw new StingException("This VCF rod is not bi-allelic.");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean parseLine(Object header, String[] parts) throws IOException {
|
public boolean parseLine(Object header, String[] parts) throws IOException {
|
||||||
throw new UnsupportedOperationException("RodVCF does not support the parseLine method");
|
throw new UnsupportedOperationException("RodVCF does not support the parseLine method");
|
||||||
|
|
@ -115,7 +110,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
||||||
*/
|
*/
|
||||||
public boolean isSNP() {
|
public boolean isSNP() {
|
||||||
assertNotNull();
|
assertNotNull();
|
||||||
assertBiAllelic();
|
|
||||||
return mCurrentRecord.isSNP();
|
return mCurrentRecord.isSNP();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -126,7 +120,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
||||||
*/
|
*/
|
||||||
public boolean isInsertion() {
|
public boolean isInsertion() {
|
||||||
assertNotNull();
|
assertNotNull();
|
||||||
assertBiAllelic();
|
|
||||||
return mCurrentRecord.isInsertion();
|
return mCurrentRecord.isInsertion();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -137,7 +130,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
||||||
*/
|
*/
|
||||||
public boolean isDeletion() {
|
public boolean isDeletion() {
|
||||||
assertNotNull();
|
assertNotNull();
|
||||||
assertBiAllelic();
|
|
||||||
return mCurrentRecord.isDeletion();
|
return mCurrentRecord.isDeletion();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.*;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
public class MismatchRate implements VariantAnnotation {
|
||||||
|
|
||||||
|
public String annotate(ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, Variation variation) {
|
||||||
|
int mismatches = 0;
|
||||||
|
int totalBases = 0;
|
||||||
|
for ( String sample : stratifiedContexts.keySet() ) {
|
||||||
|
ReadBackedPileup pileup = stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup();
|
||||||
|
Pair<Integer, Integer> counts = AlignmentUtils.mismatchesInRefWindow(pileup, ref, true);
|
||||||
|
mismatches += counts.first;
|
||||||
|
totalBases += counts.second;
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanity check
|
||||||
|
if ( totalBases == 0 )
|
||||||
|
return null;
|
||||||
|
|
||||||
|
return String.format("%.2f", (double)mismatches/(double)totalBases);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getKeyName() { return "MR"; }
|
||||||
|
|
||||||
|
public String getDescription() { return "MR,1,Float,\"Mismatch Rate of Reads Spanning This Position\""; }
|
||||||
|
}
|
||||||
|
|
@ -30,18 +30,19 @@ public class SNPGenotypeConcordance implements ConcordanceType {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
|
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
|
||||||
|
char refBase = ref.getBase();
|
||||||
|
|
||||||
Genotype call1 = samplesToRecords.get(sample1);
|
Genotype call1 = samplesToRecords.get(sample1);
|
||||||
Genotype call2 = samplesToRecords.get(sample2);
|
Genotype call2 = samplesToRecords.get(sample2);
|
||||||
|
|
||||||
if ( call1 == null || call2 == null ) {
|
if ( call1 == null || call2 == null ) {
|
||||||
if ( call1 != null && call1.isPointGenotype() ) {
|
if ( call1 != null && call1.isPointGenotype() && call1.isVariant(refBase) ) {
|
||||||
if ( 10.0 * call1.getNegLog10PError() >= Qscore )
|
if ( 10.0 * call1.getNegLog10PError() >= Qscore )
|
||||||
return "set1ConfidentSet2NoCall";
|
return "set1ConfidentSet2NoCall";
|
||||||
else
|
else
|
||||||
return "set2NoCall";
|
return "set2NoCall";
|
||||||
}
|
}
|
||||||
else if ( call2 != null && call2.isPointGenotype() ) {
|
else if ( call2 != null && call2.isPointGenotype() && call2.isVariant(refBase) ) {
|
||||||
if (10.0 * call2.getNegLog10PError() >= Qscore )
|
if (10.0 * call2.getNegLog10PError() >= Qscore )
|
||||||
return "set1NoCallSet2Confident";
|
return "set1NoCallSet2Confident";
|
||||||
else
|
else
|
||||||
|
|
@ -50,13 +51,19 @@ public class SNPGenotypeConcordance implements ConcordanceType {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if either is an indel, skip this site
|
||||||
|
if ( !call1.isPointGenotype() || !call2.isPointGenotype() )
|
||||||
|
return null;
|
||||||
|
|
||||||
double confidence1 = 10.0 * call1.getNegLog10PError();
|
double confidence1 = 10.0 * call1.getNegLog10PError();
|
||||||
double confidence2 = 10.0 * call2.getNegLog10PError();
|
double confidence2 = 10.0 * call2.getNegLog10PError();
|
||||||
String genotype1 = call1.getBases();
|
String genotype1 = call1.getBases();
|
||||||
String genotype2 = call2.getBases();
|
String genotype2 = call2.getBases();
|
||||||
|
|
||||||
// are they both variant SNPs?
|
// are they both SNPs?
|
||||||
if ( call1.isPointGenotype() && call2.isPointGenotype() ) {
|
boolean call1IsVariant = call1.isVariant(refBase);
|
||||||
|
boolean call2IsVariant = call2.isVariant(refBase);
|
||||||
|
if ( call1IsVariant && call2IsVariant ) {
|
||||||
|
|
||||||
// are they confident calls?
|
// are they confident calls?
|
||||||
boolean conf1 = confidence1 >= Qscore;
|
boolean conf1 = confidence1 >= Qscore;
|
||||||
|
|
@ -87,10 +94,10 @@ public class SNPGenotypeConcordance implements ConcordanceType {
|
||||||
}
|
}
|
||||||
|
|
||||||
// one is variant and the other is ref
|
// one is variant and the other is ref
|
||||||
else if ( call1.isPointGenotype() && call2.isVariant(ref.getBase()) && confidence1 >= Qscore )
|
else if ( call1IsVariant )
|
||||||
return "set1VariantSet2Ref";
|
return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "VariantSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Ref";
|
||||||
else if ( call2.isPointGenotype() && call1.isVariant(ref.getBase()) && confidence2 >= Qscore )
|
else if ( call2IsVariant )
|
||||||
return "set1RefSet2Variant";
|
return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "RefSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Variant";
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.Cigar;
|
import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
import net.sf.picard.reference.ReferenceSequence;
|
import net.sf.picard.reference.ReferenceSequence;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.*;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -121,7 +123,7 @@ public class AlignmentUtils {
|
||||||
public static int numMismatches(SAMRecord r, String refSeq, int refIndex) {
|
public static int numMismatches(SAMRecord r, String refSeq, int refIndex) {
|
||||||
int readIdx = 0;
|
int readIdx = 0;
|
||||||
int mismatches = 0;
|
int mismatches = 0;
|
||||||
String readSeq = r.getReadString();
|
byte[] readSeq = r.getReadBases();
|
||||||
Cigar c = r.getCigar();
|
Cigar c = r.getCigar();
|
||||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||||
CigarElement ce = c.getCigarElement(i);
|
CigarElement ce = c.getCigarElement(i);
|
||||||
|
|
@ -131,7 +133,7 @@ public class AlignmentUtils {
|
||||||
if ( refIndex >= refSeq.length() )
|
if ( refIndex >= refSeq.length() )
|
||||||
continue;
|
continue;
|
||||||
char refChr = refSeq.charAt(refIndex);
|
char refChr = refSeq.charAt(refIndex);
|
||||||
char readChr = readSeq.charAt(readIdx);
|
char readChr = (char)readSeq[readIdx];
|
||||||
// Note: we need to count X/N's as mismatches because that's what SAM requires
|
// Note: we need to count X/N's as mismatches because that's what SAM requires
|
||||||
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
||||||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
||||||
|
|
@ -153,6 +155,89 @@ public class AlignmentUtils {
|
||||||
return mismatches;
|
return mismatches;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the number of mismatches in the pileup within the given reference context.
|
||||||
|
*
|
||||||
|
* @param pileup the pileup with reads
|
||||||
|
* @param ref the reference context
|
||||||
|
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
|
||||||
|
* @return a pair where the first value is the mismatches and the second is the total bases within the context
|
||||||
|
*/
|
||||||
|
public static Pair<Integer, Integer> mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
|
||||||
|
Pair<Integer, Integer> mismatches = new Pair<Integer, Integer>(0, 0);
|
||||||
|
for ( PileupElement p : pileup ) {
|
||||||
|
Pair<Integer, Integer> mm = mismatchesInRefWindow(p, ref, ignoreTargetSite);
|
||||||
|
mismatches.first += mm.first;
|
||||||
|
mismatches.second += mm.second;
|
||||||
|
}
|
||||||
|
return mismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the number of mismatches in the pileup element within the given reference context.
|
||||||
|
*
|
||||||
|
* @param p the pileup element
|
||||||
|
* @param ref the reference context
|
||||||
|
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
|
||||||
|
* @return a pair where the first value is the mismatches and the second is the total bases within the context
|
||||||
|
*/
|
||||||
|
public static Pair<Integer, Integer> mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
|
||||||
|
|
||||||
|
int mismatches = 0;
|
||||||
|
int totalBases = 0;
|
||||||
|
|
||||||
|
GenomeLoc window = ref.getWindow();
|
||||||
|
char[] refBases = ref.getBases();
|
||||||
|
byte[] readBases = p.getRead().getReadBases();
|
||||||
|
Cigar c = p.getRead().getCigar();
|
||||||
|
|
||||||
|
int readIndex = 0;
|
||||||
|
int currentPos = p.getRead().getAlignmentStart();
|
||||||
|
int refIndex = Math.max(0, currentPos - (int)window.getStart());
|
||||||
|
|
||||||
|
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||||
|
CigarElement ce = c.getCigarElement(i);
|
||||||
|
switch ( ce.getOperator() ) {
|
||||||
|
case M:
|
||||||
|
for (int j = 0; j < ce.getLength(); j++, readIndex++, currentPos++) {
|
||||||
|
// are we past the ref window?
|
||||||
|
if ( currentPos > window.getStop() )
|
||||||
|
break;
|
||||||
|
|
||||||
|
// are we before the ref window?
|
||||||
|
if ( currentPos < window.getStart() )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
char refChr = refBases[refIndex++];
|
||||||
|
|
||||||
|
// do we need to skip the target site?
|
||||||
|
if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
totalBases++;
|
||||||
|
char readChr = (char)readBases[readIndex];
|
||||||
|
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
|
||||||
|
mismatches++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case I:
|
||||||
|
readIndex += ce.getLength();
|
||||||
|
break;
|
||||||
|
case D:
|
||||||
|
currentPos += ce.getLength();
|
||||||
|
if ( currentPos > window.getStart() )
|
||||||
|
refIndex += Math.min(ce.getLength(), currentPos - window.getStart());
|
||||||
|
break;
|
||||||
|
case S:
|
||||||
|
readIndex += ce.getLength();
|
||||||
|
break;
|
||||||
|
default: throw new StingException("Only M,I,D,S cigar elements are currently supported; there was " + ce.getOperator());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.println("There are " + mismatches + " mismatches out of " + totalBases + " bases for read " + p.getRead().getReadName());
|
||||||
|
return new Pair<Integer, Integer>(mismatches, totalBases);
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
|
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
|
||||||
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
|
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
|
||||||
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
|
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
|
||||||
|
|
|
||||||
|
|
@ -53,9 +53,9 @@ public interface Genotype {
|
||||||
public GenomeLoc getLocation();
|
public GenomeLoc getLocation();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* returns true if the genotype is a point genotype, false if it's a indel / deletion
|
* returns true if the genotype is a point genotype, false if it's an indel
|
||||||
*
|
*
|
||||||
* @return true is a SNP
|
* @return true if this is a SNP
|
||||||
*/
|
*/
|
||||||
public boolean isPointGenotype();
|
public boolean isPointGenotype();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -120,7 +120,7 @@ public interface Variation {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
|
* gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
|
||||||
* of
|
* or if we're not bi-allelic
|
||||||
*
|
*
|
||||||
* @return a char, representing the alternate base
|
* @return a char, representing the alternate base
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -248,6 +248,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
||||||
}
|
}
|
||||||
|
|
||||||
public double getNonRefAlleleFrequency() {
|
public double getNonRefAlleleFrequency() {
|
||||||
|
if ( !isBiallelic() )
|
||||||
|
throw new StingException("This VCF record is not bi-allelic.");
|
||||||
|
|
||||||
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
|
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
|
||||||
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
|
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
||||||
public void testSimpleVenn() {
|
public void testSimpleVenn() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1,
|
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1,
|
||||||
Arrays.asList("2c7e18901dbf27bac9f36b3dbee063c6"));
|
Arrays.asList("5c8e4757d2ce46bc50991a171f988327"));
|
||||||
executeTest("testSimpleVenn", spec);
|
executeTest("testSimpleVenn", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
||||||
public void testSNPConcordance() {
|
public void testSNPConcordance() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
|
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||||
Arrays.asList("8a17c93572a2c498feeaf8ba172d40d6"));
|
Arrays.asList("b4904813cdfd37f0a092aa6cadcd3f71"));
|
||||||
executeTest("testSNPConcordance", spec);
|
executeTest("testSNPConcordance", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
||||||
public void testNWayVenn() {
|
public void testNWayVenn() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1,
|
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1,
|
||||||
Arrays.asList("2b38ae235edd10773dbee0bfae036e35"));
|
Arrays.asList("cf915dc9762d6a44a7bdadc0d7eae9b8"));
|
||||||
executeTest("testNWayVenn", spec);
|
executeTest("testNWayVenn", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -38,7 +38,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
||||||
public void testMulti() {
|
public void testMulti() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1,
|
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||||
Arrays.asList("4e0f768389028dbd09266ee1802ccd3b"));
|
Arrays.asList("d046599a2fef386fa0ad5dfc9671c3a9"));
|
||||||
executeTest("testMulti", spec);
|
executeTest("testMulti", spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue