Bug fix for popular _Duplicate allele added to VariantContext_ error reported on the forum. It seems to be due to lower case bases in the reference being treated as reference mismatches. We would try to turn these mismatches into SNP events, for example c/C. We now uppercase the result from IndexedFastaSequenceFile.getSubsequenceAt()

This commit is contained in:
Ryan Poplin 2012-08-22 14:39:35 -04:00
parent 03017855e4
commit e5cfdb4811
6 changed files with 37 additions and 65 deletions

View File

@ -42,14 +42,12 @@ import java.util.*;
public class GenotypingEngine {
private final boolean DEBUG;
private final int MNP_LOOK_AHEAD;
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
public GenotypingEngine( final boolean DEBUG, final int MNP_LOOK_AHEAD, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
this.DEBUG = DEBUG;
this.MNP_LOOK_AHEAD = MNP_LOOK_AHEAD;
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
noCall.add(Allele.NO_CALL);
}
@ -120,7 +118,7 @@ public class GenotypingEngine {
System.out.println( "> Cigar = " + h.getCigar() );
}
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
startPosKeySet.addAll(h.getEventMap().keySet());
}
@ -203,7 +201,7 @@ public class GenotypingEngine {
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
for( final Haplotype h : haplotypes ) {
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( DEBUG ) {
System.out.println( h.toString() );
@ -521,11 +519,7 @@ public class GenotypingEngine {
return false;
}
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
return generateVCsFromAlignment(null, alignmentStartHapwrtRef, cigar, ref, alignment, refLoc, sourceNameToAdd, MNP_LOOK_AHEAD); // BUGBUG: needed for compatibility with HaplotypeResolver code
}
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) {
final HashMap<Integer,VariantContext> vcs = new HashMap<Integer,VariantContext>();
int refPos = alignmentStartHapwrtRef;
@ -543,7 +537,7 @@ public class GenotypingEngine {
if( BaseUtils.isRegularBase(refByte) ) {
insertionAlleles.add( Allele.create(refByte, true) );
}
if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
if( (haplotype.leftBreakPoint != 0 || haplotype.rightBreakPoint != 0) && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
} else {
byte[] insertionBases = new byte[]{};
@ -590,39 +584,16 @@ public class GenotypingEngine {
case EQ:
case X:
{
int numSinceMismatch = -1;
int stopOfMismatch = -1;
int startOfMismatch = -1;
int refPosStartOfMismatch = -1;
for( int iii = 0; iii < elementLength; iii++ ) {
if( ref[refPos] != alignment[alignmentPos] && alignment[alignmentPos] != ((byte) 'N') ) {
// SNP or start of possible MNP
if( stopOfMismatch == -1 ) {
startOfMismatch = alignmentPos;
stopOfMismatch = alignmentPos;
numSinceMismatch = 0;
refPosStartOfMismatch = refPos;
} else {
stopOfMismatch = alignmentPos;
}
}
if( stopOfMismatch != -1) {
numSinceMismatch++;
}
if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) {
final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 );
final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 );
if( BaseUtils.isAllRegularBases(refBases) && BaseUtils.isAllRegularBases(mismatchBases) ) {
final byte refByte = ref[refPos];
final byte altByte = alignment[alignmentPos];
if( refByte != altByte ) { // SNP!
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) {
final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
snpAlleles.add( Allele.create( refBases, true ) );
snpAlleles.add( Allele.create( mismatchBases, false ) );
final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
snpAlleles.add( Allele.create( refByte, true ) );
snpAlleles.add( Allele.create( altByte, false ) );
vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
}
numSinceMismatch = -1;
stopOfMismatch = -1;
startOfMismatch = -1;
refPosStartOfMismatch = -1;
}
refPos++;
alignmentPos++;

View File

@ -122,10 +122,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;
@Hidden
@Argument(fullName="mnpLookAhead", shortName="mnpLookAhead", doc = "The number of bases to combine together to form MNPs out of nearby consecutive SNPs on the same haplotype", required = false)
protected int MNP_LOOK_AHEAD = 0;
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 1;
@ -138,7 +134,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
@Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Gap continuation penalty for use in the Pair HMM", required = false)
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10;
@Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false)
@ -192,21 +188,21 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@ArgumentCollection
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
private UnifiedGenotyperEngine UG_engine_simple_genotyper = null;
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
protected boolean DEBUG;
// the UG engines
private UnifiedGenotyperEngine UG_engine = null;
private UnifiedGenotyperEngine UG_engine_simple_genotyper = null;
// the assembly engine
LocalAssemblyEngine assemblyEngine = null;
private LocalAssemblyEngine assemblyEngine = null;
// the likelihoods engine
LikelihoodCalculationEngine likelihoodCalculationEngine = null;
private LikelihoodCalculationEngine likelihoodCalculationEngine = null;
// the genotyping engine
GenotypingEngine genotypingEngine = null;
private GenotypingEngine genotypingEngine = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
@ -292,7 +288,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false );
genotypingEngine = new GenotypingEngine( DEBUG, MNP_LOOK_AHEAD, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
}
//---------------------------------------------------------------------------------------------------------------

View File

@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
@ -337,8 +338,8 @@ public class HaplotypeResolver extends RodWalker<Integer, Integer> {
}
// order results by start position
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1, 0));
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2, 0));
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1));
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2));
if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
// TODO -- handle errors appropriately
logger.debug("No source alleles; aborting at " + refContext.getLocus());

View File

@ -292,7 +292,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {

View File

@ -142,7 +142,6 @@ public class GenotypingEngineUnitTest extends BaseTest {
byte[] ref;
byte[] hap;
HashMap<Integer,Byte> expected;
GenotypingEngine ge = new GenotypingEngine(false, 0, false);
public BasicGenotypingTestProvider(String refString, String hapString, HashMap<Integer, Byte> expected) {
super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString));
@ -153,7 +152,7 @@ public class GenotypingEngineUnitTest extends BaseTest {
public HashMap<Integer,VariantContext> calcAlignment() {
final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap);
return ge.generateVCsFromAlignment( alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name", 0);
return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name");
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.utils.activeregion;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.HasGenomeLocation;
@ -58,9 +59,7 @@ public class ActiveRegion implements HasGenomeLocation, Comparable<ActiveRegion>
}
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
return referenceReader.getSubsequenceAt( extendedLoc.getContig(),
Math.max(1, extendedLoc.getStart() - padding),
Math.min(referenceReader.getSequenceDictionary().getSequence(extendedLoc.getContig()).getSequenceLength(), extendedLoc.getStop() + padding) ).getBases();
return getReference( referenceReader, padding, extendedLoc );
}
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
@ -68,9 +67,15 @@ public class ActiveRegion implements HasGenomeLocation, Comparable<ActiveRegion>
}
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(),
Math.max(1, fullExtentReferenceLoc.getStart() - padding),
Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases();
return getReference( referenceReader, padding, fullExtentReferenceLoc );
}
private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(),
Math.max(1, genomeLoc.getStart() - padding),
Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
StringUtil.toUpperCase(reference);
return reference;
}
@Override