I want to move over to hpprojects tonight, so I'm checking in various changes all in one go:
1. Initial code for annotating calls with the base mismatch rate within a reference window (still needs analysis). 2. Move error checking code from rodVCF to VCFRecord. 3. More improvements to SNP Genotype callset concordance. 4. Fixed some comments in Variation/Genotype git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2341 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2748eb60e1
commit
bd2a46ab4c
|
|
@ -42,11 +42,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
}
|
||||
}
|
||||
|
||||
public void assertBiAllelic() {
|
||||
if ( !isBiallelic() )
|
||||
throw new StingException("This VCF rod is not bi-allelic.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean parseLine(Object header, String[] parts) throws IOException {
|
||||
throw new UnsupportedOperationException("RodVCF does not support the parseLine method");
|
||||
|
|
@ -115,7 +110,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
*/
|
||||
public boolean isSNP() {
|
||||
assertNotNull();
|
||||
assertBiAllelic();
|
||||
return mCurrentRecord.isSNP();
|
||||
}
|
||||
|
||||
|
|
@ -126,7 +120,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
*/
|
||||
public boolean isInsertion() {
|
||||
assertNotNull();
|
||||
assertBiAllelic();
|
||||
return mCurrentRecord.isInsertion();
|
||||
}
|
||||
|
||||
|
|
@ -137,7 +130,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
*/
|
||||
public boolean isDeletion() {
|
||||
assertNotNull();
|
||||
assertBiAllelic();
|
||||
return mCurrentRecord.isDeletion();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class MismatchRate implements VariantAnnotation {
|
||||
|
||||
public String annotate(ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, Variation variation) {
|
||||
int mismatches = 0;
|
||||
int totalBases = 0;
|
||||
for ( String sample : stratifiedContexts.keySet() ) {
|
||||
ReadBackedPileup pileup = stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup();
|
||||
Pair<Integer, Integer> counts = AlignmentUtils.mismatchesInRefWindow(pileup, ref, true);
|
||||
mismatches += counts.first;
|
||||
totalBases += counts.second;
|
||||
}
|
||||
|
||||
// sanity check
|
||||
if ( totalBases == 0 )
|
||||
return null;
|
||||
|
||||
return String.format("%.2f", (double)mismatches/(double)totalBases);
|
||||
}
|
||||
|
||||
public String getKeyName() { return "MR"; }
|
||||
|
||||
public String getDescription() { return "MR,1,Float,\"Mismatch Rate of Reads Spanning This Position\""; }
|
||||
}
|
||||
|
|
@ -30,18 +30,19 @@ public class SNPGenotypeConcordance implements ConcordanceType {
|
|||
}
|
||||
|
||||
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref) {
|
||||
char refBase = ref.getBase();
|
||||
|
||||
Genotype call1 = samplesToRecords.get(sample1);
|
||||
Genotype call2 = samplesToRecords.get(sample2);
|
||||
|
||||
if ( call1 == null || call2 == null ) {
|
||||
if ( call1 != null && call1.isPointGenotype() ) {
|
||||
if ( call1 != null && call1.isPointGenotype() && call1.isVariant(refBase) ) {
|
||||
if ( 10.0 * call1.getNegLog10PError() >= Qscore )
|
||||
return "set1ConfidentSet2NoCall";
|
||||
else
|
||||
return "set2NoCall";
|
||||
}
|
||||
else if ( call2 != null && call2.isPointGenotype() ) {
|
||||
else if ( call2 != null && call2.isPointGenotype() && call2.isVariant(refBase) ) {
|
||||
if (10.0 * call2.getNegLog10PError() >= Qscore )
|
||||
return "set1NoCallSet2Confident";
|
||||
else
|
||||
|
|
@ -50,13 +51,19 @@ public class SNPGenotypeConcordance implements ConcordanceType {
|
|||
return null;
|
||||
}
|
||||
|
||||
// if either is an indel, skip this site
|
||||
if ( !call1.isPointGenotype() || !call2.isPointGenotype() )
|
||||
return null;
|
||||
|
||||
double confidence1 = 10.0 * call1.getNegLog10PError();
|
||||
double confidence2 = 10.0 * call2.getNegLog10PError();
|
||||
String genotype1 = call1.getBases();
|
||||
String genotype2 = call2.getBases();
|
||||
|
||||
// are they both variant SNPs?
|
||||
if ( call1.isPointGenotype() && call2.isPointGenotype() ) {
|
||||
// are they both SNPs?
|
||||
boolean call1IsVariant = call1.isVariant(refBase);
|
||||
boolean call2IsVariant = call2.isVariant(refBase);
|
||||
if ( call1IsVariant && call2IsVariant ) {
|
||||
|
||||
// are they confident calls?
|
||||
boolean conf1 = confidence1 >= Qscore;
|
||||
|
|
@ -87,10 +94,10 @@ public class SNPGenotypeConcordance implements ConcordanceType {
|
|||
}
|
||||
|
||||
// one is variant and the other is ref
|
||||
else if ( call1.isPointGenotype() && call2.isVariant(ref.getBase()) && confidence1 >= Qscore )
|
||||
return "set1VariantSet2Ref";
|
||||
else if ( call2.isPointGenotype() && call1.isVariant(ref.getBase()) && confidence2 >= Qscore )
|
||||
return "set1RefSet2Variant";
|
||||
else if ( call1IsVariant )
|
||||
return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "VariantSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Ref";
|
||||
else if ( call2IsVariant )
|
||||
return "set1" + (confidence1 >= Qscore ? "Confident" : "") + "RefSet2" + (confidence2 >= Qscore ? "Confident" : "") + "Variant";
|
||||
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -121,7 +123,7 @@ public class AlignmentUtils {
|
|||
public static int numMismatches(SAMRecord r, String refSeq, int refIndex) {
|
||||
int readIdx = 0;
|
||||
int mismatches = 0;
|
||||
String readSeq = r.getReadString();
|
||||
byte[] readSeq = r.getReadBases();
|
||||
Cigar c = r.getCigar();
|
||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
|
|
@ -131,7 +133,7 @@ public class AlignmentUtils {
|
|||
if ( refIndex >= refSeq.length() )
|
||||
continue;
|
||||
char refChr = refSeq.charAt(refIndex);
|
||||
char readChr = readSeq.charAt(readIdx);
|
||||
char readChr = (char)readSeq[readIdx];
|
||||
// Note: we need to count X/N's as mismatches because that's what SAM requires
|
||||
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
||||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
||||
|
|
@ -153,6 +155,89 @@ public class AlignmentUtils {
|
|||
return mismatches;
|
||||
}
|
||||
|
||||
/** Returns the number of mismatches in the pileup within the given reference context.
|
||||
*
|
||||
* @param pileup the pileup with reads
|
||||
* @param ref the reference context
|
||||
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
|
||||
* @return a pair where the first value is the mismatches and the second is the total bases within the context
|
||||
*/
|
||||
public static Pair<Integer, Integer> mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
|
||||
Pair<Integer, Integer> mismatches = new Pair<Integer, Integer>(0, 0);
|
||||
for ( PileupElement p : pileup ) {
|
||||
Pair<Integer, Integer> mm = mismatchesInRefWindow(p, ref, ignoreTargetSite);
|
||||
mismatches.first += mm.first;
|
||||
mismatches.second += mm.second;
|
||||
}
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
/** Returns the number of mismatches in the pileup element within the given reference context.
|
||||
*
|
||||
* @param p the pileup element
|
||||
* @param ref the reference context
|
||||
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
|
||||
* @return a pair where the first value is the mismatches and the second is the total bases within the context
|
||||
*/
|
||||
public static Pair<Integer, Integer> mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
|
||||
|
||||
int mismatches = 0;
|
||||
int totalBases = 0;
|
||||
|
||||
GenomeLoc window = ref.getWindow();
|
||||
char[] refBases = ref.getBases();
|
||||
byte[] readBases = p.getRead().getReadBases();
|
||||
Cigar c = p.getRead().getCigar();
|
||||
|
||||
int readIndex = 0;
|
||||
int currentPos = p.getRead().getAlignmentStart();
|
||||
int refIndex = Math.max(0, currentPos - (int)window.getStart());
|
||||
|
||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch ( ce.getOperator() ) {
|
||||
case M:
|
||||
for (int j = 0; j < ce.getLength(); j++, readIndex++, currentPos++) {
|
||||
// are we past the ref window?
|
||||
if ( currentPos > window.getStop() )
|
||||
break;
|
||||
|
||||
// are we before the ref window?
|
||||
if ( currentPos < window.getStart() )
|
||||
continue;
|
||||
|
||||
char refChr = refBases[refIndex++];
|
||||
|
||||
// do we need to skip the target site?
|
||||
if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos )
|
||||
continue;
|
||||
|
||||
totalBases++;
|
||||
char readChr = (char)readBases[readIndex];
|
||||
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
|
||||
mismatches++;
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
readIndex += ce.getLength();
|
||||
break;
|
||||
case D:
|
||||
currentPos += ce.getLength();
|
||||
if ( currentPos > window.getStart() )
|
||||
refIndex += Math.min(ce.getLength(), currentPos - window.getStart());
|
||||
break;
|
||||
case S:
|
||||
readIndex += ce.getLength();
|
||||
break;
|
||||
default: throw new StingException("Only M,I,D,S cigar elements are currently supported; there was " + ce.getOperator());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//System.out.println("There are " + mismatches + " mismatches out of " + totalBases + " bases for read " + p.getRead().getReadName());
|
||||
return new Pair<Integer, Integer>(mismatches, totalBases);
|
||||
}
|
||||
|
||||
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
|
||||
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
|
||||
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
|
||||
|
|
|
|||
|
|
@ -53,9 +53,9 @@ public interface Genotype {
|
|||
public GenomeLoc getLocation();
|
||||
|
||||
/**
|
||||
* returns true if the genotype is a point genotype, false if it's a indel / deletion
|
||||
* returns true if the genotype is a point genotype, false if it's an indel
|
||||
*
|
||||
* @return true is a SNP
|
||||
* @return true if this is a SNP
|
||||
*/
|
||||
public boolean isPointGenotype();
|
||||
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ public interface Variation {
|
|||
|
||||
/**
|
||||
* gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
|
||||
* of
|
||||
* or if we're not bi-allelic
|
||||
*
|
||||
* @return a char, representing the alternate base
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -248,6 +248,9 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
|
|||
}
|
||||
|
||||
public double getNonRefAlleleFrequency() {
|
||||
if ( !isBiallelic() )
|
||||
throw new StingException("This VCF record is not bi-allelic.");
|
||||
|
||||
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
|
||||
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
public void testSimpleVenn() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn", 1,
|
||||
Arrays.asList("2c7e18901dbf27bac9f36b3dbee063c6"));
|
||||
Arrays.asList("5c8e4757d2ce46bc50991a171f988327"));
|
||||
executeTest("testSimpleVenn", spec);
|
||||
}
|
||||
|
||||
|
|
@ -22,7 +22,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
public void testSNPConcordance() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||
Arrays.asList("8a17c93572a2c498feeaf8ba172d40d6"));
|
||||
Arrays.asList("b4904813cdfd37f0a092aa6cadcd3f71"));
|
||||
executeTest("testSNPConcordance", spec);
|
||||
}
|
||||
|
||||
|
|
@ -30,7 +30,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
public void testNWayVenn() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -B set3,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/CEU.sample.vcf -CT NWayVenn", 1,
|
||||
Arrays.asList("2b38ae235edd10773dbee0bfae036e35"));
|
||||
Arrays.asList("cf915dc9762d6a44a7bdadc0d7eae9b8"));
|
||||
executeTest("testNWayVenn", spec);
|
||||
}
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ public class CallsetConcordanceIntegrationTest extends WalkerTest {
|
|||
public void testMulti() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -B set1,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example1.vcf -B set2,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.example2.vcf -CT SimpleVenn -CT NWayVenn -CT SNPGenotypeConcordance:qscore=5", 1,
|
||||
Arrays.asList("4e0f768389028dbd09266ee1802ccd3b"));
|
||||
Arrays.asList("d046599a2fef386fa0ad5dfc9671c3a9"));
|
||||
executeTest("testMulti", spec);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue