System to merge multiple nearby alleles into block substitutions

-- Block substitution algorithm that merges nearby events based on distance.
-- Also does some cleanup of GenotypingEngine
This commit is contained in:
Mark DePristo 2013-03-27 09:36:14 -04:00
parent bff13bb5c5
commit 0310499b65
9 changed files with 707 additions and 209 deletions

View File

@ -432,7 +432,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
// for GGA mode, add the desired allele into the haplotype if it isn't already present
if( !activeAllelesToGenotype.isEmpty() ) {
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());

View File

@ -58,6 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.EventExtractor;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
@ -72,7 +73,6 @@ public class GenotypingEngine {
private final boolean DEBUG;
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
private final VariantAnnotatorEngine annotationEngine;
public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ) {
@ -145,99 +145,26 @@ public class GenotypingEngine {
final GenomeLocParser genomeLocParser,
final List<VariantContext> activeAllelesToGenotype ) {
// sanity check input arguments
if (UG_engine == null)
throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
if (haplotypes == null || haplotypes.isEmpty())
throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (samples == null || samples.isEmpty())
throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples);
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty())
throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap);
if (ref == null || ref.length == 0 )
throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length)
throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
if (activeRegionWindow == null )
throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
if (activeAllelesToGenotype == null )
throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
if (genomeLocParser == null )
throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (samples == null || samples.isEmpty()) throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples);
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap);
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
int count = 0;
if( DEBUG ) { logger.info("=== Best Haplotypes ==="); }
for( final Haplotype h : haplotypes ) {
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( DEBUG ) {
logger.info(h.toString());
logger.info("> Cigar = " + h.getCigar());
logger.info(">> Events = " + h.getEventMap());
}
}
cleanUpSymbolicUnassembledEvents( haplotypes );
if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
}
if( in_GGA_mode ) {
for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() );
}
}
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
// that carry events among the haplotypes
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, samples, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype);
// Walk along each position in the key set and create each event to be outputted
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
final List<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
if( !in_GGA_mode ) {
for( final Haplotype h : haplotypes ) {
final Map<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
eventsAtThisLoc.add(vc);
priorityList.add(vc.getSource());
}
}
} else { // we are in GGA mode!
int compCount = 0;
for( final VariantContext compVC : activeAllelesToGenotype ) {
if( compVC.getStart() == loc ) {
int alleleCount = 0;
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
List<Allele> alleleSet = new ArrayList<Allele>(2);
alleleSet.add(compVC.getReference());
alleleSet.add(compAltAllele);
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
// check if this event is already in the list of events due to a repeat in the input alleles track
final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make();
boolean alreadyExists = false;
for( final VariantContext eventToTest : eventsAtThisLoc ) {
if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) {
alreadyExists = true;
}
}
if( !alreadyExists ) {
priorityList.add(vcSourceName);
eventsAtThisLoc.add(candidateEventToAdd);
}
alleleCount++;
}
}
compCount++;
}
}
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
if( eventsAtThisLoc.isEmpty() ) { continue; }
@ -245,7 +172,7 @@ public class GenotypingEngine {
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
// Sanity check the priority list for mistakes
validatePriorityList( priorityList, eventsAtThisLoc );
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
// Merge the event to find a common reference representation
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
@ -264,7 +191,6 @@ public class GenotypingEngine {
if( DEBUG ) {
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
}
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
@ -277,7 +203,6 @@ public class GenotypingEngine {
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
VariantContext annotatedCall = call;
// TODO -- should be before annotated call, so that QDL works correctly
if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
}
@ -295,6 +220,117 @@ public class GenotypingEngine {
return new CalledHaplotypes(returnCalls, calledHaplotypes);
}
/**
* Go through the haplotypes we assembled, and decompose them into their constituent variant contexts
*
* @param haplotypes the list of haplotypes we're working with
* @param samples the samples we're working with
* @param haplotypeReadMap map from samples -> the per read allele likelihoods
* @param ref the reference bases (over the same interval as the haplotypes)
* @param refLoc the span of the reference bases
* @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode)
* @return a sorted set of start positions of the events found on the haplotypes (in GGA mode, the starts of the requested alleles instead)
*/
private TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
final List<String> samples,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final byte[] ref,
final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) {
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
int hapNumber = 0;
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
if( DEBUG ) logger.info("=== Best Haplotypes ===");
for( final Haplotype h : haplotypes ) {
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( new EventExtractor( h, ref, refLoc, "HC" + hapNumber++ ) );
if( ! in_GGA_mode ) {
// in GGA mode the positions to genotype come from activeAllelesToGenotype below, not from the haplotypes
startPosKeySet.addAll(h.getEventMap().getStartPositions());
}
if( DEBUG ) {
logger.info(h.toString());
logger.info("> Cigar = " + h.getCigar());
logger.info(">> Events = " + h.getEventMap());
}
}
cleanUpSymbolicUnassembledEvents( haplotypes );
if ( !in_GGA_mode && samples.size() >= 10 ) {
// if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
}
if ( in_GGA_mode ) {
for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() );
}
}
return startPosKeySet;
}
/**
 * Build the priority list (just the list of sources for these variant contexts) used to
 * merge overlapping events into a common reference view.
 *
 * @param vcs a list of variant contexts
 * @return the sources of vcs, in the same order as vcs
 */
private List<String> makePriorityList(final List<VariantContext> vcs) {
    final List<String> sources = new LinkedList<String>();
    for ( int i = 0; i < vcs.size(); i++ ) {
        sources.add(vcs.get(i).getSource());
    }
    return sources;
}
/**
* Get the list of unique events (variant contexts) starting at position loc, drawn either from the
* haplotypes' event maps (normal mode) or from the requested input alleles (GGA mode).
*
* @param haplotypes the haplotypes whose event maps we scan when not in GGA mode
* @param loc the reference start position of interest
* @param activeAllelesToGenotype when non-empty we are in GGA mode and only these alleles are used
* @return the list of distinct events starting at loc
*/
private List<VariantContext> getVCsAtThisLocation(final List<Haplotype> haplotypes,
final int loc,
final List<VariantContext> activeAllelesToGenotype) {
// the overlapping events to merge into a common reference view
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
if( activeAllelesToGenotype.isEmpty() ) {
for( final Haplotype h : haplotypes ) {
final EventExtractor eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
// keep only one copy of each distinct allele set seen across the haplotypes
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
eventsAtThisLoc.add(vc);
}
}
} else { // we are in GGA mode!
int compCount = 0;
for( final VariantContext compVC : activeAllelesToGenotype ) {
if( compVC.getStart() == loc ) {
int alleleCount = 0;
// split each input VC into biallelic ref/alt events, one per alternate allele
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
List<Allele> alleleSet = new ArrayList<Allele>(2);
alleleSet.add(compVC.getReference());
alleleSet.add(compAltAllele);
// source name encodes which input VC and which alt allele this event came from
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
// check if this event is already in the list of events due to a repeat in the input alleles track
final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make();
boolean alreadyExists = false;
for( final VariantContext eventToTest : eventsAtThisLoc ) {
if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) {
alreadyExists = true;
}
}
if( !alreadyExists ) {
eventsAtThisLoc.add(candidateEventToAdd);
}
alleleCount++;
}
}
compCount++;
}
}
return eventsAtThisLoc;
}
/**
* For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele
* @param samples List of samples to genotype
@ -322,23 +358,6 @@ public class GenotypingEngine {
return genotypes;
}
/**
* Sanity check that priorityList and eventsAtThisLoc refer to exactly the same set of source names.
*
* @param priorityList the list of source names used for merging
* @param eventsAtThisLoc the events whose sources must match priorityList
* @throws ReviewedStingException if either list contains a source name the other lacks
*/
private void validatePriorityList( final List<String> priorityList, final List<VariantContext> eventsAtThisLoc ) {
// every event's source must appear in the priority list
for( final VariantContext vc : eventsAtThisLoc ) {
if( !priorityList.contains(vc.getSource()) ) {
throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
}
}
// and every priority list entry must correspond to some event
for( final String name : priorityList ) {
boolean found = false;
for( final VariantContext vc : eventsAtThisLoc ) {
if(vc.getSource().equals(name)) { found = true; break; }
}
if( !found ) {
throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
}
}
}
private static Map<String, PerReadAlleleLikelihoodMap> filterToOnlyOverlappingReads( final GenomeLocParser parser,
final Map<String, PerReadAlleleLikelihoodMap> perSampleReadMap,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
@ -382,10 +401,10 @@ public class GenotypingEngine {
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
final List<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().values() ) {
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
if( vc.isSymbolic() ) {
for( final Haplotype h2 : haplotypes ) {
for( final VariantContext vc2 : h2.getEventMap().values() ) {
for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) {
if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events
haplotypesToRemove.add(h);
break;
@ -512,11 +531,10 @@ public class GenotypingEngine {
// remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
for( final Haplotype h : haplotypes ) {
final Map<Integer, VariantContext> eventMap = h.getEventMap();
if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) {
eventMap.remove(thisStart);
eventMap.remove(nextStart);
eventMap.put(mergedVC.getStart(), mergedVC);
if( h.getEventMap().containsKey(thisStart) && h.getEventMap().containsKey(nextStart) ) {
h.getEventMap().remove(thisStart);
h.getEventMap().remove(nextStart);
h.getEventMap().put(mergedVC.getStart(), mergedVC);
}
}
startPosKeySet.add(mergedVC.getStart());
@ -697,92 +715,9 @@ public class GenotypingEngine {
return eventAllelesForSample;
}
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) {
final Map<Integer,VariantContext> vcs = new LinkedHashMap<Integer,VariantContext>();
int refPos = alignmentStartHapwrtRef;
if( refPos < 0 ) { return null; } // Protection against SW failures
int alignmentPos = 0;
for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) {
final CigarElement ce = cigar.getCigarElement(cigarIndex);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
{
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
final List<Allele> insertionAlleles = new ArrayList<Allele>();
final int insertionStart = refLoc.getStart() + refPos - 1;
final byte refByte = ref[refPos-1];
if( BaseUtils.isRegularBase(refByte) ) {
insertionAlleles.add( Allele.create(refByte, true) );
}
if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
} else {
byte[] insertionBases = new byte[]{};
insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base
insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength ));
if( BaseUtils.isAllRegularBases(insertionBases) ) {
insertionAlleles.add( Allele.create(insertionBases, false) );
}
}
if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
}
}
alignmentPos += elementLength;
break;
}
case S:
{
alignmentPos += elementLength;
break;
}
case D:
{
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base
final List<Allele> deletionAlleles = new ArrayList<Allele>();
final int deletionStart = refLoc.getStart() + refPos - 1;
final byte refByte = ref[refPos-1];
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
deletionAlleles.add( Allele.create(deletionBases, true) );
deletionAlleles.add( Allele.create(refByte, false) );
vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
}
}
refPos += elementLength;
break;
}
case M:
case EQ:
case X:
{
for( int iii = 0; iii < elementLength; iii++ ) {
final byte refByte = ref[refPos];
final byte altByte = alignment[alignmentPos];
if( refByte != altByte ) { // SNP!
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) {
final List<Allele> snpAlleles = new ArrayList<Allele>();
snpAlleles.add( Allele.create( refByte, true ) );
snpAlleles.add( Allele.create( altByte, false ) );
vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
}
}
refPos++;
alignmentPos++;
}
break;
}
case N:
case H:
case P:
default:
throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
}
}
return vcs;
/**
* Deprecated compatibility shim: the alignment-to-variant-context decomposition now lives in
* EventExtractor; this simply constructs one. Callers should use EventExtractor directly.
*
* @return an EventExtractor (itself a Map from start position to VariantContext) over the haplotype's events
*/
@Deprecated
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) {
return new EventExtractor(haplotype, ref, refLoc, sourceNameToAdd);
}
protected static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {

View File

@ -360,8 +360,8 @@ public class HaplotypeResolver extends RodWalker<Integer, Integer> {
}
// order results by start position
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1));
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2));
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1));
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2));
if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
// TODO -- handle errors appropriately
logger.debug("No source alleles; aborting at " + refContext.getLocus());

View File

@ -199,7 +199,8 @@ public class GenotypingEngineUnitTest extends BaseTest {
public Map<Integer,VariantContext> calcAlignment() {
final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap);
return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name");
final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar());
return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name");
}
}

View File

@ -0,0 +1,307 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.haplotype;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import java.util.*;
/**
* Extract simple VariantContext events from a single haplotype
*
* User: depristo
* Date: 3/27/13
* Time: 8:35 AM
*/
public class EventExtractor extends TreeMap<Integer, VariantContext> {
private final static Logger logger = Logger.getLogger(EventExtractor.class);
private final static boolean mergeClumpedEvents = true;
protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3;
public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
/**
* Build the event map for haplotype by walking its alignment against ref, then, when enough
* events were found, try to merge clumped events into block substitutions.
*
* @param haplotype the haplotype (bases, cigar, alignment start) to decompose into events
* @param ref the reference bases spanning refLoc
* @param refLoc the genomic span of ref
* @param sourceNameToAdd the source name stamped on each created VariantContext
*/
public EventExtractor( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) {
super();
processCigarForInitialEvents(haplotype, ref, refLoc, sourceNameToAdd);
// only attempt block substitution when there are enough events for merging to be worthwhile
if ( mergeClumpedEvents && getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) {
replaceClumpedEventsWithBlockSubstititions(haplotype, ref, refLoc);
}
}
/**
* For testing. Lets you set up an explicit configuration without having to process a haplotype and reference
* @param stateForTesting the start position -> variant context state to seed this extractor with
*/
protected EventExtractor(final Map<Integer, VariantContext> stateForTesting) {
super(stateForTesting);
}
/**
* For testing. Lets you set up an explicit configuration without having to process a haplotype and reference
* @param stateForTesting the variant contexts to seed this extractor with, keyed by their start positions
*/
protected EventExtractor(final Collection<VariantContext> stateForTesting) {
for ( final VariantContext vc : stateForTesting )
addVC(vc);
}
/**
* Walk along haplotype's cigar against ref and record every difference as an initial
* VariantContext event in this map: SNPs from M/EQ/X mismatches, insertions from I,
* deletions from D. Indel events use the VCF-style padding base before the event.
*
* @param haplotype the haplotype (bases, cigar, alignment start) to decompose
* @param ref the reference bases spanning refLoc
* @param refLoc the genomic span of ref
* @param sourceNameToAdd the source name stamped on each created event
*/
protected void processCigarForInitialEvents(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) {
final Cigar cigar = haplotype.getCigar();
final byte[] alignment = haplotype.getBases();
int refPos = haplotype.getAlignmentStartHapwrtRef();
if( refPos < 0 ) {
return;
} // Protection against SW failures
int alignmentPos = 0;
for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) {
final CigarElement ce = cigar.getCigarElement(cigarIndex);
final int elementLength = ce.getLength();
switch( ce.getOperator() ) {
case I:
{
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
final List<Allele> insertionAlleles = new ArrayList<Allele>();
final int insertionStart = refLoc.getStart() + refPos - 1; // anchor the insertion on the base before it
final byte refByte = ref[refPos-1];
if( BaseUtils.isRegularBase(refByte) ) {
insertionAlleles.add( Allele.create(refByte, true) );
}
if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
} else {
byte[] insertionBases = new byte[]{};
insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base
insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength));
if( BaseUtils.isAllRegularBases(insertionBases) ) {
insertionAlleles.add( Allele.create(insertionBases, false) );
}
}
if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
}
}
alignmentPos += elementLength;
break;
}
case S:
{
// soft clip: consumes haplotype bases but no reference bases; no event created
alignmentPos += elementLength;
break;
}
case D:
{
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base
final List<Allele> deletionAlleles = new ArrayList<Allele>();
final int deletionStart = refLoc.getStart() + refPos - 1;
final byte refByte = ref[refPos-1];
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
deletionAlleles.add( Allele.create(deletionBases, true) );
deletionAlleles.add( Allele.create(refByte, false) );
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
}
}
refPos += elementLength;
break;
}
case M:
case EQ:
case X:
{
// walk base-by-base looking for mismatches against the reference
for( int iii = 0; iii < elementLength; iii++ ) {
final byte refByte = ref[refPos];
final byte altByte = alignment[alignmentPos];
if( refByte != altByte ) { // SNP!
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) {
final List<Allele> snpAlleles = new ArrayList<Allele>();
snpAlleles.add( Allele.create( refByte, true ) );
snpAlleles.add( Allele.create( altByte, false ) );
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
}
}
refPos++;
alignmentPos++;
}
break;
}
case N:
case H:
case P:
default:
throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
}
}
}
/**
* Add vc to this map keyed by its start position, merging with any event already bound there.
*/
private void addVC(final VariantContext vc) {
addVC(vc, true);
}
/**
* Add vc to this map keyed by its start position.
*
* @param vc the event to add
* @param merge if true and an event is already bound at vc's start, combine the two into a single
* block event via makeBlock; if false such a collision is an error
* @throws IllegalStateException if merge is false and an event already exists at vc's start
*/
private void addVC(final VariantContext vc, final boolean merge) {
if ( containsKey(vc.getStart()) ) {
if ( merge ) {
// collapse the colliding events into one block event at this start position
final VariantContext prev = get(vc.getStart());
put(vc.getStart(), makeBlock(prev, vc));
} else {
throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc);
}
} else
put(vc.getStart(), vc);
}
/**
 * Combine two events that share the same start position into a single block event.
 *
 * vc1 must be a SNP.  When both events carry the same reference allele, vc2 is the
 * insertion case: the two alt alleles are concatenated (dropping vc2's shared padding
 * base).  Otherwise vc2 is the deletion case: its (longer) reference allele and end
 * position are adopted while keeping vc1's alt base.
 *
 * @param vc1 the SNP event already bound at this start position
 * @param vc2 the colliding event at the same start position
 * @return a single merged VariantContext covering both events
 * @throws IllegalArgumentException if vc1 is not a SNP
 */
private VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) {
    if ( ! vc1.isSNP() ) throw new IllegalArgumentException("vc1 must be a snp");

    final VariantContextBuilder builder = new VariantContextBuilder(vc1);
    final Allele blockRef;
    final Allele blockAlt;

    if ( vc1.getReference().equals(vc2.getReference()) ) {
        // same ref allele -> vc2 is an insertion; splice its inserted bases (minus the
        // shared padding base) onto the SNP's alt base
        blockRef = vc1.getReference();
        final String combinedAltBases = vc1.getAlternateAllele(0).getDisplayString()
                + vc2.getAlternateAllele(0).getDisplayString().substring(1);
        blockAlt = Allele.create(combinedAltBases, false);
    } else {
        // different ref alleles -> vc2 is a deletion; adopt its ref allele and span,
        // keeping the SNP's alt base
        blockRef = vc2.getReference();
        blockAlt = vc1.getAlternateAllele(0);
        builder.stop(vc2.getEnd());
    }

    return builder.alleles(Arrays.asList(blockRef, blockAlt)).make();
}
// TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it
/**
* Repeatedly scan the events for clumps of nearby variant contexts and, where a clump qualifies,
* replace it with a single block substitution (see updateToBlockSubstitutionIfBetter). The scan
* restarts after every successful merge because the map is modified during iteration.
* NOTE(review): the method name misspells "Substitutions"; kept as-is to preserve the interface.
*
* @param haplotype the haplotype whose events are being merged
* @param ref the reference bases
* @param refLoc the genomic span of ref
*/
@Requires("getNumberOfEvents() > 0")
protected void replaceClumpedEventsWithBlockSubstititions(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) {
// lastStart guarantees forward progress: only events to the right of the last merge point are reconsidered
int lastStart = -1;
for ( boolean foundOne = true; foundOne; ) {
foundOne = false;
for ( final VariantContext vc : getVariantContexts() ) {
if ( vc.getStart() > lastStart ) {
lastStart = vc.getStart();
// gather the events near vc (within 10 bp of each other, per getNeighborhood)
final List<VariantContext> neighborhood = getNeighborhood(vc, 10);
if ( updateToBlockSubstitutionIfBetter(neighborhood, haplotype, ref, refLoc) ) {
// the map changed under us; break out and rescan from the top
foundOne = true;
break;
}
}
}
}
}
/**
 * Try to replace a clump of neighboring events with one block substitution spanning
 * from the first event's start to the last event's end.
 *
 * Does nothing unless the clump contains at least
 * MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION events. On success all of the
 * original events are removed from this map and the new block event is bound in their
 * place.
 *
 * @param neighbors the clump of nearby events, ordered left to right
 * @param haplotype the haplotype whose bases supply the alternate allele
 * @param ref the reference bases corresponding to refLoc
 * @param refLoc the genomic span corresponding to ref
 * @return true if the clump was replaced with a block substitution, false otherwise
 */
protected boolean updateToBlockSubstitutionIfBetter(final List<VariantContext> neighbors, final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) {
    if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION)
        return false;
    // TODO -- need more tests to decide if this is really so good

    final VariantContext leftmost = neighbors.get(0);
    final VariantContext rightmost = neighbors.get(neighbors.size() - 1);

    // 0-based offsets of the clump's span within the ref array
    final int refStartOffset = leftmost.getStart() - refLoc.getStart();
    final int refEndOffset = rightmost.getEnd() - refLoc.getStart();

    final byte[] refAllele = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1);
    final byte[] altAllele = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar());

    final VariantContext block = new VariantContextBuilder(leftmost)
            .stop(leftmost.getStart() + refAllele.length - 1)
            .alleles(Arrays.asList(Allele.create(refAllele, true), Allele.create(altAllele)))
            .make();

    // remove all merged events
    for ( final VariantContext merged : neighbors ) {
        if ( remove(merged.getStart()) == null )
            throw new IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged);
    }

    // note must be after we remove the previous events as the treeset only allows one key per start
    logger.info("Transforming into block substitution at " + block);
    addVC(block, false);

    return true;
}
/**
 * Collect the run of variant contexts beginning at leftMost whose inter-event gaps are
 * all strictly less than maxBPBetweenEvents.
 *
 * Events are accumulated left to right: each candidate is included when the distance
 * between its start and the end of the most recently accepted event is < maxBPBetweenEvents
 * (note: strictly less than, so maxBPBetweenEvents itself is an exclusive bound).
 *
 * @param leftMost the left most (smallest position) variant context anchoring the neighborhood
 * @param maxBPBetweenEvents the exclusive upper bound on the bp gap between the end of one
 *                           event and the start of the next for them to be clumped together
 * @return a list that contains at least one element (leftMost itself)
 */
@Requires({"leftMost != null", "maxBPBetweenEvents >= 0"})
@Ensures({"result != null", "! result.isEmpty()"})
protected List<VariantContext> getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) {
    final List<VariantContext> clump = new LinkedList<VariantContext>();

    VariantContext rightmostAccepted = leftMost;
    for ( final VariantContext vc : getVariantContexts() ) {
        // ignore events before the anchor; accept the rest while the gap to the last
        // accepted event stays under the threshold
        if ( vc.getStart() >= leftMost.getStart() && vc.getStart() - rightmostAccepted.getEnd() < maxBPBetweenEvents ) {
            clump.add(vc);
            rightmostAccepted = vc;
        }
    }

    return clump;
}
/** @return the set of reference start positions that currently have an event bound */
public Set<Integer> getStartPositions() {
    return keySet();
}
/**
 * @return all events in this map, in map iteration order (presumably sorted by start --
 *         getNeighborhood relies on left-to-right order; verify against the backing map)
 */
public Collection<VariantContext> getVariantContexts() {
    return values();
}
/** @return the number of distinct events in this map (at most one per start position) */
public int getNumberOfEvents() {
    return size();
}
/**
 * Render each event as chr:start-end alleles (comma-terminated), wrapped in
 * "EventExtractor{...}".
 */
@Override
public String toString() {
    final StringBuilder sb = new StringBuilder();
    sb.append("EventExtractor{");
    for ( final VariantContext vc : getVariantContexts() ) {
        sb.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles()));
    }
    return sb.append("}").toString();
}
}

View File

@ -43,7 +43,7 @@ import java.util.*;
public class Haplotype extends Allele {
private GenomeLoc genomeLocation = null;
private Map<Integer, VariantContext> eventMap = null;
private EventExtractor eventMap = null;
private Cigar cigar;
private int alignmentStartHapwrtRef;
private Event artificialEvent = null;
@ -63,6 +63,12 @@ public class Haplotype extends Allele {
this(bases, false);
}
/**
 * Create a haplotype with an explicit alignment against the reference.
 *
 * @param bases the bases of this haplotype
 * @param isRef is this the reference haplotype?
 * @param alignmentStartHapwrtRef start offset of this haplotype w.r.t. the reference
 * @param cigar the cigar mapping these bases onto the reference
 */
public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) {
    this(bases, isRef);
    this.alignmentStartHapwrtRef = alignmentStartHapwrtRef;
    this.cigar = cigar;
}
/**
* Copy constructor. Note the ref state of the provided allele is ignored!
*
@ -92,11 +98,11 @@ public class Haplotype extends Allele {
return Arrays.hashCode(getBases());
}
public Map<Integer, VariantContext> getEventMap() {
public EventExtractor getEventMap() {
return eventMap;
}
public void setEventMap( final Map<Integer, VariantContext> eventMap ) {
public void setEventMap( final EventExtractor eventMap ) {
this.eventMap = eventMap;
}

View File

@ -48,6 +48,67 @@ public final class AlignmentUtils {
// utility class -- cannot be instantiated
private AlignmentUtils() { }
/**
 * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the
 * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases
 * on the reference.
 *
 * refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the
 * reference bases start at position X and you want Y -> Z, refStart should be Y - X and
 * refEnd should be Z - X.
 *
 * @param refStart 0-based inclusive offset on the reference of the first base to return
 * @param refEnd 0-based inclusive offset on the reference of the last base to return
 * @param bases the read/haplotype bases aligned to the reference by basesToRefCigar
 * @param basesStartOnRef where does the bases array start w.r.t. the reference start? For example, bases[0]
 *                        could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at
 *                        10 (meaning bases doesn't fully span the reference), which would be indicated by
 *                        basesStartOnRef == 10. It's not trivial to eliminate this parameter because it's
 *                        tied up with the cigar
 * @param basesToRefCigar the cigar that maps the bases to the reference genome
 * @return a non-null byte[]
 * @throws IllegalArgumentException on bad offsets, null inputs, or a bases/cigar length mismatch
 * @throws IllegalStateException if refStart or refEnd is never reached while walking the cigar
 */
public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) {
    if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd);
    if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef);
    if ( bases == null ) throw new IllegalArgumentException("Bases cannot be null");
    if ( basesToRefCigar == null ) throw new IllegalArgumentException("basesToRefCigar cannot be null");
    if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar);

    int refPos = basesStartOnRef;
    int basesPos = 0;
    int basesStart = -1;
    int basesStop = -1;

    // walk the cigar base by base, recording the bases-array offsets at which we cross
    // refStart and refEnd; stop as soon as refEnd is reached
    elementLoop:
    for ( int elementIdx = 0; elementIdx < basesToRefCigar.numCigarElements(); elementIdx++ ) {
        final CigarElement ce = basesToRefCigar.getCigarElement(elementIdx);

        // per-base increments for this cigar operator: insertions consume bases only,
        // deletions consume reference only, match/mismatch consume both
        final int baseStep;
        final int refStep;
        switch ( ce.getOperator() ) {
            case I: baseStep = 1; refStep = 0; break;
            case M: case X: case EQ: baseStep = 1; refStep = 1; break;
            case D: baseStep = 0; refStep = 1; break;
            default:
                throw new IllegalStateException("Unsupported operator " + ce);
        }

        for ( int step = 0; step < ce.getLength(); step++ ) {
            if ( refPos == refStart )
                basesStart = basesPos;
            if ( refPos == refEnd ) {
                basesStop = basesPos;
                break elementLoop;
            }
            refPos += refStep;
            basesPos += baseStep;
        }
    }

    if ( basesStart == -1 || basesStop == -1 )
        throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar);

    return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
}
/**
* Get the number of bases at which refSeq and readSeq differ, given their alignment
*

View File

@ -1424,4 +1424,21 @@ public class GATKVariantContextUtils {
return result;
}
/**
 * Are vc1 and vc2 equal sites, i.e. do they have the same position (chromosome, start,
 * end) and the same alleles?
 *
 * @param vc1 non-null VariantContext
 * @param vc2 non-null VariantContext
 * @return true if vc1 and vc2 describe the same site, false otherwise
 */
public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) {
    if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null");
    if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null");

    // cheap integer comparisons first, then the object comparisons
    return vc1.getStart() == vc2.getStart()
            && vc1.getEnd() == vc2.getEnd()
            && vc1.getChr().equals(vc2.getChr())
            && vc1.getAlleles().equals(vc2.getAlleles());
}
}

View File

@ -0,0 +1,171 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.haplotype;
import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
 * Unit tests for EventExtractor: neighborhood clumping, block-substitution creation,
 * and merging of SNPs adjacent to indels.
 */
public class EventExtractorUnitTest extends BaseTest {
    private final static String CHR = "20";
    private final static String NAME = "foo";

    /**
     * Builds EventExtractors containing an optional left / middle / right non-clumped SNP
     * plus a clump of nClumped events spaced 2 bp apart starting at position 50, paired
     * with the list of events expected to form the clump's neighborhood.
     */
    @DataProvider(name = "MyDataProvider")
    public Object[][] makeMyDataProvider() {
        List<Object[]> tests = new ArrayList<Object[]>();

        final List<String> SNP_ALLELES = Arrays.asList("A", "C");
        final List<String> INS_ALLELES = Arrays.asList("A", "ACGTGA");
        final List<String> DEL_ALLELES = Arrays.asList("ACGTA", "C");
        final List<List<String>> allAlleles = Arrays.asList(SNP_ALLELES, INS_ALLELES, DEL_ALLELES);

        for ( final int leftNotClump : Arrays.asList(-1, 3) ) {
            for ( final int middleNotClump : Arrays.asList(-1, 10, 500) ) {
                for ( final int rightNotClump : Arrays.asList(-1, 1000) ) {
                    for ( final int nClumped : Arrays.asList(3, 4) ) {
                        for ( final List<List<String>> alleles : Utils.makePermutations(allAlleles, nClumped, true)) {
                            final List<VariantContext> allVCS = new LinkedList<VariantContext>();
                            if ( leftNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, leftNotClump, SNP_ALLELES));
                            if ( middleNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, middleNotClump, SNP_ALLELES));
                            if ( rightNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, rightNotClump, SNP_ALLELES));

                            // lay out the clumped events 2 bp apart starting at position 50
                            int clumpStart = 50;
                            final List<VariantContext> vcs = new LinkedList<VariantContext>();
                            for ( final List<String> myAlleles : alleles ) {
                                final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(NAME, CHR, clumpStart, myAlleles);
                                clumpStart = vc.getEnd() + 3;
                                vcs.add(vc);
                            }

                            // case without the clump, and case with the clump added
                            tests.add(new Object[]{new EventExtractor(new LinkedList<VariantContext>(allVCS)), Collections.emptyList()});
                            allVCS.addAll(vcs);
                            tests.add(new Object[]{new EventExtractor(allVCS), vcs});
                        }
                    }
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Tests getNeighborhood: the clumped events must come back as one neighborhood
     * (anchored at the clump's leftmost event), while every non-clumped event must form
     * a singleton neighborhood of its own.
     */
    @Test(dataProvider = "MyDataProvider", enabled = true)
    public void testGetNeighborhood(final EventExtractor eventExtractor, final List<VariantContext> expectedNeighbors) {
        final VariantContext leftOfNeighbors = expectedNeighbors.isEmpty() ? null : expectedNeighbors.get(0);
        for ( final VariantContext vc : eventExtractor.getVariantContexts() ) {
            final List<VariantContext> n = eventExtractor.getNeighborhood(vc, 5);
            if ( leftOfNeighbors == vc )
                Assert.assertEquals(n, expectedNeighbors);
            else if ( ! expectedNeighbors.contains(vc) )
                Assert.assertEquals(n, Collections.singletonList(vc), "Should only contain the original vc but " + n);
        }
    }

    /**
     * Ref/haplotype/cigar triples whose clumped differences should collapse into the
     * single expected block-substitution event.
     */
    @DataProvider(name = "BlockSubstitutionsData")
    public Object[][] makeBlockSubstitutionsData() {
        List<Object[]> tests = new ArrayList<Object[]>();

        for ( int size = EventExtractor.MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION; size < 10; size++ ) {
            final String ref = Utils.dupString("A", size);
            final String alt = Utils.dupString("C", size);
            tests.add(new Object[]{ref, alt, size + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList(ref, alt))});
        }

        tests.add(new Object[]{"AAAAAA", "GAGAGA", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAA", "GAGAG"))});
        tests.add(new Object[]{"AAAAAA", "GAGAGG", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAAA", "GAGAGG"))});

        // blocks containing indels, with varying amounts of flanking matching sequence
        for ( int len = 0; len < 10; len++ ) {
            final String s = len == 0 ? "" : Utils.dupString("A", len);
            tests.add(new Object[]{s + "AACCCCAA", s + "GAAG", len + 2 + "M4D2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AACCCCAA", "GAAG"))});
            tests.add(new Object[]{s + "AAAA", s + "GACCCCAG", len + 2 + "M4I2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AAAA", "GACCCCAG"))});
            tests.add(new Object[]{"AACCCCAA" + s, "GAAG" + s, "2M4D" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AACCCCAA", "GAAG"))});
            tests.add(new Object[]{"AAAA" + s, "GACCCCAG" + s, "2M4I" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAA", "GACCCCAG"))});
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Tests that a clump of nearby events on a haplotype is merged into exactly one
     * block-substitution event equal to the expected one.
     */
    @Test(dataProvider = "BlockSubstitutionsData")
    public void testBlockSubstitutionsData(final String refBases, final String haplotypeBases, final String cigar, final VariantContext expectedBlock) {
        final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
        final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
        final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME);
        Assert.assertEquals(ee.getNumberOfEvents(), 1);
        final VariantContext actual = ee.getVariantContexts().iterator().next();
        Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual);
    }

    /**
     * Ref/haplotype/cigar triples with SNPs adjacent to indels, paired with the
     * ref/alt allele strings of the events expected after extraction (in order).
     */
    @DataProvider(name = "AdjacentSNPIndelTest")
    public Object[][] makeAdjacentSNPIndelTest() {
        List<Object[]> tests = new ArrayList<Object[]>();

        tests.add(new Object[]{"TT", "GCT", "1M1I1M", Arrays.asList(Arrays.asList("T", "GC"))});
        tests.add(new Object[]{"GCT", "TT", "1M1D", Arrays.asList(Arrays.asList("GC", "T"))});
        tests.add(new Object[]{"TT", "GCCT", "1M2I1M", Arrays.asList(Arrays.asList("T", "GCC"))});
        tests.add(new Object[]{"GCCT", "TT", "1M2D", Arrays.asList(Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAGCCT", "AATT", "3M2D", Arrays.asList(Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAGCCT", "GATT", "3M2D", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAAAA", "AGACA", "5M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("A", "C"))});

        return tests.toArray(new Object[][]{});
    }

    /**
     * Tests that SNPs adjacent to indels are merged into the expected combined events,
     * checking both the number of extracted events and their ref/alt alleles.
     */
    @Test(dataProvider = "AdjacentSNPIndelTest", enabled = true)
    public void testAdjacentSNPIndelTest(final String refBases, final String haplotypeBases, final String cigar, final List<List<String>> expectedAlleles) {
        final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
        final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
        final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME);
        Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size());
        final List<VariantContext> actuals = new ArrayList<VariantContext>(ee.getVariantContexts());
        for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {
            final VariantContext actual = actuals.get(i);
            Assert.assertEquals(actual.getReference().getDisplayString(), expectedAlleles.get(i).get(0));
            Assert.assertEquals(actual.getAlternateAllele(0).getDisplayString(), expectedAlleles.get(i).get(1));
        }
    }
}