From 0310499b656f4a6bc41b46c64abaabd5e473d984 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Mar 2013 09:36:14 -0400 Subject: [PATCH] System to merge multiple nearby alleles into block substitutions -- Block substitution algorithm that merges nearby events based on distance. -- Also does some cleanup of GenotypingEngine --- .../haplotypecaller/DeBruijnAssembler.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 339 +++++++----------- .../haplotypecaller/HaplotypeResolver.java | 4 +- .../GenotypingEngineUnitTest.java | 3 +- .../sting/utils/haplotype/EventExtractor.java | 307 ++++++++++++++++ .../sting/utils/haplotype/Haplotype.java | 12 +- .../sting/utils/sam/AlignmentUtils.java | 61 ++++ .../variant/GATKVariantContextUtils.java | 17 + .../haplotype/EventExtractorUnitTest.java | 171 +++++++++ 9 files changed, 707 insertions(+), 209 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 9bc0713c0..1fd2b9c00 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -432,7 +432,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { - final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place + final Map eventMap = 
GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 34d81d405..8e76b6ea6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -58,6 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.EventExtractor; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; @@ -72,7 +73,6 @@ public class GenotypingEngine { private final boolean DEBUG; private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied - private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); private final VariantAnnotatorEngine annotationEngine; public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ) { @@ -145,99 +145,26 @@ public class GenotypingEngine { final GenomeLocParser genomeLocParser, final List 
activeAllelesToGenotype ) { // sanity check input arguments - if (UG_engine == null) - throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); - if (haplotypes == null || haplotypes.isEmpty()) - throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); - if (samples == null || samples.isEmpty()) - throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples); - if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) - throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); - if (ref == null || ref.length == 0 ) - throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); - if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) - throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); - if (activeRegionWindow == null ) - throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); - if (activeAllelesToGenotype == null ) - throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - if (genomeLocParser == null ) - throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); + if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); + if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); + if (samples == null || samples.isEmpty()) throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples); + if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); + if (ref == 
null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); + if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); + if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); + if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); + if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); - final List returnCalls = new ArrayList(); - final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { logger.info("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } - if( DEBUG ) { - logger.info(h.toString()); - logger.info("> Cigar = " + h.getCigar()); - logger.info(">> Events = " + h.getEventMap()); - } - } - - cleanUpSymbolicUnassembledEvents( haplotypes ); - if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); - cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events 
- } - if( in_GGA_mode ) { - for( final VariantContext compVC : activeAllelesToGenotype ) { - startPosKeySet.add( compVC.getStart() ); - } - } - - final Set calledHaplotypes = new HashSet(); + // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference + // that carry events among the haplotypes + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, samples, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); // Walk along each position in the key set and create each event to be outputted + final Set calledHaplotypes = new HashSet(); + final List returnCalls = new ArrayList(); for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final List eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view - final List priorityList = new ArrayList(); // used to merge overlapping events into common reference view - - if( !in_GGA_mode ) { - for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - priorityList.add(vc.getSource()); - } - } - } else { // we are in GGA mode! 
- int compCount = 0; - for( final VariantContext compVC : activeAllelesToGenotype ) { - if( compVC.getStart() == loc ) { - int alleleCount = 0; - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - List alleleSet = new ArrayList(2); - alleleSet.add(compVC.getReference()); - alleleSet.add(compAltAllele); - final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; - // check if this event is already in the list of events due to a repeat in the input alleles track - final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); - boolean alreadyExists = false; - for( final VariantContext eventToTest : eventsAtThisLoc ) { - if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { - alreadyExists = true; - } - } - if( !alreadyExists ) { - priorityList.add(vcSourceName); - eventsAtThisLoc.add(candidateEventToAdd); - } - alleleCount++; - } - } - compCount++; - } - } + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); if( eventsAtThisLoc.isEmpty() ) { continue; } @@ -245,7 +172,7 @@ public class GenotypingEngine { final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); // Sanity check the priority list for mistakes - validatePriorityList( priorityList, eventsAtThisLoc ); + final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); @@ -264,7 +191,6 @@ public class GenotypingEngine { if( DEBUG ) { logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - //System.out.println("Event/haplotype allele mapping = " + alleleMapper); } final Map alleleReadMap = 
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); @@ -277,7 +203,6 @@ public class GenotypingEngine { final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = call; - // TODO -- should be before annotated call, so that QDL works correctly if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); } @@ -295,6 +220,117 @@ public class GenotypingEngine { return new CalledHaplotypes(returnCalls, calledHaplotypes); } + /** + * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts + * + * @param haplotypes the list of haplotypes we're working with + * @param samples the samples we're working with + * @param haplotypeReadMap map from samples -> the per read allele likelihoods + * @param ref the reference bases (over the same interval as the haplotypes) + * @param refLoc the span of the reference bases + * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) + * @return + */ + private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final byte[] ref, + final GenomeLoc refLoc, + final List activeAllelesToGenotype) { + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); + int hapNumber = 0; + + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = new TreeSet(); + + if( DEBUG ) logger.info("=== Best Haplotypes ==="); + for( final Haplotype h : haplotypes ) { + // Walk along the alignment and turn any difference from the reference into an event + 
h.setEventMap( new EventExtractor( h, ref, refLoc, "HC" + hapNumber++ ) ); + if( ! in_GGA_mode ) { + startPosKeySet.addAll(h.getEventMap().getStartPositions()); + } + + if( DEBUG ) { + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); + } + } + + cleanUpSymbolicUnassembledEvents( haplotypes ); + if ( !in_GGA_mode && samples.size() >= 10 ) { + // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure + mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); + cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events + } + + if ( in_GGA_mode ) { + for( final VariantContext compVC : activeAllelesToGenotype ) { + startPosKeySet.add( compVC.getStart() ); + } + } + + return startPosKeySet; + } + + /** + * Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view + * @param vcs a list of variant contexts + * @return the list of the sources of vcs in the same order + */ + private List makePriorityList(final List vcs) { + final List priorityList = new LinkedList(); + for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); + + return priorityList; + } + + private List getVCsAtThisLocation(final List haplotypes, + final int loc, + final List activeAllelesToGenotype) { + // the overlapping events to merge into a common reference view + final List eventsAtThisLoc = new ArrayList(); + + if( activeAllelesToGenotype.isEmpty() ) { + for( final Haplotype h : haplotypes ) { + final EventExtractor eventMap = h.getEventMap(); + final VariantContext vc = eventMap.get(loc); + if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { + eventsAtThisLoc.add(vc); + } + } + } else { // we are in GGA mode! 
+ int compCount = 0; + for( final VariantContext compVC : activeAllelesToGenotype ) { + if( compVC.getStart() == loc ) { + int alleleCount = 0; + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + List alleleSet = new ArrayList(2); + alleleSet.add(compVC.getReference()); + alleleSet.add(compAltAllele); + final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; + // check if this event is already in the list of events due to a repeat in the input alleles track + final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); + boolean alreadyExists = false; + for( final VariantContext eventToTest : eventsAtThisLoc ) { + if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { + alreadyExists = true; + } + } + if( !alreadyExists ) { + eventsAtThisLoc.add(candidateEventToAdd); + } + alleleCount++; + } + } + compCount++; + } + } + + return eventsAtThisLoc; + } + /** * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele * @param samples List of samples to genotype @@ -322,23 +358,6 @@ public class GenotypingEngine { return genotypes; } - private void validatePriorityList( final List priorityList, final List eventsAtThisLoc ) { - for( final VariantContext vc : eventsAtThisLoc ) { - if( !priorityList.contains(vc.getSource()) ) { - throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles."); - } - } - for( final String name : priorityList ) { - boolean found = false; - for( final VariantContext vc : eventsAtThisLoc ) { - if(vc.getSource().equals(name)) { found = true; break; } - } - if( !found ) { - throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. 
Something went wrong in the merging of alleles."); - } - } - } - private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, final Map perSampleReadMap, final Map> perSampleFilteredReadList, @@ -382,10 +401,10 @@ public class GenotypingEngine { protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final List haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { - for( final VariantContext vc : h.getEventMap().values() ) { + for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { if( vc.isSymbolic() ) { for( final Haplotype h2 : haplotypes ) { - for( final VariantContext vc2 : h2.getEventMap().values() ) { + for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) { if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events haplotypesToRemove.add(h); break; @@ -512,11 +531,10 @@ public class GenotypingEngine { // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) { - eventMap.remove(thisStart); - eventMap.remove(nextStart); - eventMap.put(mergedVC.getStart(), mergedVC); + if( h.getEventMap().containsKey(thisStart) && h.getEventMap().containsKey(nextStart) ) { + h.getEventMap().remove(thisStart); + h.getEventMap().remove(nextStart); + h.getEventMap().put(mergedVC.getStart(), mergedVC); } } startPosKeySet.add(mergedVC.getStart()); @@ -697,92 +715,9 @@ public class GenotypingEngine { return eventAllelesForSample; } - protected static Map generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { - final Map vcs = new 
LinkedHashMap(); - - int refPos = alignmentStartHapwrtRef; - if( refPos < 0 ) { return null; } // Protection against SW failures - int alignmentPos = 0; - - for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { - final CigarElement ce = cigar.getCigarElement(cigarIndex); - final int elementLength = ce.getLength(); - switch( ce.getOperator() ) { - case I: - { - if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig - final List insertionAlleles = new ArrayList(); - final int insertionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) ) { - insertionAlleles.add( Allele.create(refByte, true) ); - } - if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele - insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); - } else { - byte[] insertionBases = new byte[]{}; - insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base - insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength )); - if( BaseUtils.isAllRegularBases(insertionBases) ) { - insertionAlleles.add( Allele.create(insertionBases, false) ); - } - } - if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele - vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); - } - } - alignmentPos += elementLength; - break; - } - case S: - { - alignmentPos += elementLength; - break; - } - case D: - { - if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig - final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base - final List deletionAlleles = new ArrayList(); 
- final int deletionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { - deletionAlleles.add( Allele.create(deletionBases, true) ); - deletionAlleles.add( Allele.create(refByte, false) ); - vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); - } - } - refPos += elementLength; - break; - } - case M: - case EQ: - case X: - { - for( int iii = 0; iii < elementLength; iii++ ) { - final byte refByte = ref[refPos]; - final byte altByte = alignment[alignmentPos]; - if( refByte != altByte ) { // SNP! - if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { - final List snpAlleles = new ArrayList(); - snpAlleles.add( Allele.create( refByte, true ) ); - snpAlleles.add( Allele.create( altByte, false ) ); - vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); - } - } - refPos++; - alignmentPos++; - } - break; - } - case N: - case H: - case P: - default: - throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); - } - } - return vcs; + @Deprecated + protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + return new EventExtractor(haplotype, ref, refLoc, sourceNameToAdd); } protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index 03af9b59b..134863b8b 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -360,8 +360,8 @@ public class HaplotypeResolver extends RodWalker { } // order results by start position - final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1)); - final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2)); + final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); + final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); if ( source1Map.size() == 0 || source2Map.size() == 0 ) { // TODO -- handle errors appropriately logger.debug("No source alleles; aborting at " + refContext.getLocus()); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 2be42337d..9fb75463a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -199,7 +199,8 @@ public class GenotypingEngineUnitTest extends BaseTest { public Map calcAlignment() { final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); - 
return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); + final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar()); + return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java new file mode 100644 index 000000000..c32cde641 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.haplotype; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; + +import java.util.*; + +/** + * Extract simple VariantContext events from a single haplotype + * + * User: depristo + * Date: 3/27/13 + * Time: 8:35 AM + */ +public class EventExtractor extends TreeMap { + private final static Logger logger = Logger.getLogger(EventExtractor.class); + private final static boolean mergeClumpedEvents = true; + protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3; + public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); + + public EventExtractor( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + super(); + + processCigarForInitialEvents(haplotype, ref, refLoc, sourceNameToAdd); + if ( mergeClumpedEvents && getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { + replaceClumpedEventsWithBlockSubstititions(haplotype, ref, refLoc); + } + } + + /** + * For testing. Let's you set up a explicit configuration without having to process a haplotype and reference + * @param stateForTesting + */ + protected EventExtractor(final Map stateForTesting) { + super(stateForTesting); + } + + /** + * For testing. 
Let's you set up a explicit configuration without having to process a haplotype and reference + * @param stateForTesting + */ + protected EventExtractor(final Collection stateForTesting) { + for ( final VariantContext vc : stateForTesting ) + addVC(vc); + } + + protected void processCigarForInitialEvents(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { + final Cigar cigar = haplotype.getCigar(); + final byte[] alignment = haplotype.getBases(); + + int refPos = haplotype.getAlignmentStartHapwrtRef(); + if( refPos < 0 ) { + return; + } // Protection against SW failures + + int alignmentPos = 0; + + for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { + final CigarElement ce = cigar.getCigarElement(cigarIndex); + final int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case I: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final List insertionAlleles = new ArrayList(); + final int insertionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) ) { + insertionAlleles.add( Allele.create(refByte, true) ); + } + if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele + insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); + } else { + byte[] insertionBases = new byte[]{}; + insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base + insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength)); + if( BaseUtils.isAllRegularBases(insertionBases) ) { + insertionAlleles.add( Allele.create(insertionBases, false) ); + } + } + if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele + addVC(new 
VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); + } + } + alignmentPos += elementLength; + break; + } + case S: + { + alignmentPos += elementLength; + break; + } + case D: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base + final List deletionAlleles = new ArrayList(); + final int deletionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { + deletionAlleles.add( Allele.create(deletionBases, true) ); + deletionAlleles.add( Allele.create(refByte, false) ); + addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + } + } + refPos += elementLength; + break; + } + case M: + case EQ: + case X: + { + for( int iii = 0; iii < elementLength; iii++ ) { + final byte refByte = ref[refPos]; + final byte altByte = alignment[alignmentPos]; + if( refByte != altByte ) { // SNP! 
+ if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { + final List snpAlleles = new ArrayList(); + snpAlleles.add( Allele.create( refByte, true ) ); + snpAlleles.add( Allele.create( altByte, false ) ); + addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); + } + } + refPos++; + alignmentPos++; + } + break; + } + case N: + case H: + case P: + default: + throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); + } + } + } + + private void addVC(final VariantContext vc) { + addVC(vc, true); + } + + private void addVC(final VariantContext vc, final boolean merge) { + if ( containsKey(vc.getStart()) ) { + if ( merge ) { + final VariantContext prev = get(vc.getStart()); + put(vc.getStart(), makeBlock(prev, vc)); + } else { + throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc); + } + } else + put(vc.getStart(), vc); + } + + private VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) { + if ( ! vc1.isSNP() ) throw new IllegalArgumentException("vc1 must be a snp"); + + Allele ref, alt; + final VariantContextBuilder b = new VariantContextBuilder(vc1); + if ( vc1.getReference().equals(vc2.getReference()) ) { + // we've got an insertion, so we just update the alt to have the prev alt + ref = vc1.getReference(); + alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false); + } else { + // we're dealing with a deletion, so we patch the ref + ref = vc2.getReference(); + alt = vc1.getAlternateAllele(0); + b.stop(vc2.getEnd()); + } + + return b.alleles(Arrays.asList(ref, alt)).make(); + } + + // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. 
If it's valuable we need to reengineer it + @Requires("getNumberOfEvents() > 0") + protected void replaceClumpedEventsWithBlockSubstititions(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) { + int lastStart = -1; + for ( boolean foundOne = true; foundOne; ) { + foundOne = false; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() > lastStart ) { + lastStart = vc.getStart(); + final List neighborhood = getNeighborhood(vc, 10); + if ( updateToBlockSubstitutionIfBetter(neighborhood, haplotype, ref, refLoc) ) { + foundOne = true; + break; + } + } + } + } + } + + protected boolean updateToBlockSubstitutionIfBetter(final List neighbors, final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) { + if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) + return false; + // TODO -- need more tests to decide if this is really so good + + final VariantContext first = neighbors.get(0); + final int refStartOffset = first.getStart() - refLoc.getStart(); + final int refEndOffset = neighbors.get(neighbors.size() - 1).getEnd() - refLoc.getStart(); + + final byte[] refBases = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1); + final byte[] hapBases = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar()); + + final VariantContextBuilder builder = new VariantContextBuilder(first); + builder.stop(first.getStart() + refBases.length - 1); + builder.alleles(Arrays.asList(Allele.create(refBases, true), Allele.create(hapBases))); + final VariantContext block = builder.make(); + + // remove all merged events + for ( final VariantContext merged : neighbors ) { + if ( remove(merged.getStart()) == null ) + throw new IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged); + } + + // note must be after we remove the 
previous events as the treeset only allows one key per start + logger.info("Transforming into block substitution at " + block); + addVC(block, false); + + return true; + } + + /** + * Get all of the variant contexts starting at leftMost that are within maxBP of each other + * + * @param leftMost the left most (smallest position) variant context that will start the neighborhood + * @param maxBPBetweenEvents the maximum distance in BP between the end of one event the start of the next + * to be included the the resulting list + * @return a list that contains at least one element (leftMost) + */ + @Requires({"leftMost != null", "maxBPBetweenEvents >= 0"}) + @Ensures({"result != null", "! result.isEmpty()"}) + protected List getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) { + final List neighbors = new LinkedList(); + + VariantContext left = leftMost; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() < leftMost.getStart() ) + continue; + + if ( vc.getStart() - left.getEnd() < maxBPBetweenEvents ) { + // this vc is within max distance to the end of the left event, so accumulate it + neighbors.add(vc); + left = vc; + } + } + + return neighbors; + } + + public Set getStartPositions() { + return keySet(); + } + + public Collection getVariantContexts() { + return values(); + } + + public int getNumberOfEvents() { + return size(); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("EventExtractor{"); + for ( final VariantContext vc : getVariantContexts() ) + b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles())); + b.append("}"); + return b.toString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index 6dc223616..2e95fb03a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -43,7 +43,7 @@ import java.util.*; public class Haplotype extends Allele { private GenomeLoc genomeLocation = null; - private Map eventMap = null; + private EventExtractor eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; private Event artificialEvent = null; @@ -63,6 +63,12 @@ public class Haplotype extends Allele { this(bases, false); } + public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) { + this(bases, isRef); + this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; + this.cigar = cigar; + } + /** * Copy constructor. Note the ref state of the provided allele is ignored! * @@ -92,11 +98,11 @@ public class Haplotype extends Allele { return Arrays.hashCode(getBases()); } - public Map getEventMap() { + public EventExtractor getEventMap() { return eventMap; } - public void setEventMap( final Map eventMap ) { + public void setEventMap( final EventExtractor eventMap ) { this.eventMap = eventMap; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 58f70d4b6..9b25b00c6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -48,6 +48,67 @@ public final class AlignmentUtils { // cannot be instantiated private AlignmentUtils() { } + /** + * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the + * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference + * + * refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the reference + * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X. 
+ * + * @param bases + * @param refStart + * @param refEnd + * @param basesStartOnRef where does the bases array start w.r.t. the reference start? For example, bases[0] of + * could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at + * 10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10. + * It's not trivial to eliminate this parameter because it's tied up with the cigar + * @param basesToRefCigar the cigar that maps the bases to the reference genome + * @return a non-null byte[] + */ + public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) { + if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd); + if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef); + if ( bases == null ) throw new IllegalArgumentException("Bases cannot be null"); + if ( basesToRefCigar == null ) throw new IllegalArgumentException("basesToRefCigar cannot be null"); + if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar); + + int refPos = basesStartOnRef; + int basesPos = 0; + + int basesStart = -1; + int basesStop = -1; + boolean done = false; + + for ( int iii = 0; ! 
done && iii < basesToRefCigar.numCigarElements(); iii++ ) { + final CigarElement ce = basesToRefCigar.getCigarElement(iii); + final int bInc, rInc; + switch ( ce.getOperator() ) { + case I: bInc = 1; rInc = 0; break; + case M: case X: case EQ: bInc = rInc = 1; break; + case D: bInc = 0; rInc = 1; break; + default: + throw new IllegalStateException("Unsupported operator " + ce); + } + + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refStart ) + basesStart = basesPos; + if ( refPos == refEnd ) { + basesStop = basesPos; + done = true; + break; + } + refPos += rInc; + basesPos += bInc; + } + } + + if ( basesStart == -1 || basesStop == -1 ) + throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar); + + return Arrays.copyOfRange(bases, basesStart, basesStop + 1); + } + /** * Get the number of bases at which refSeq and readSeq differ, given their alignment * diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 0bd30c3a4..4565402b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -1424,4 +1424,21 @@ public class GATKVariantContextUtils { return result; } + + /** + * Are vc1 and 2 equal including their position and alleles? 
+ * @param vc1 non-null VariantContext + * @param vc2 non-null VariantContext + * @return true if vc1 and vc2 are equal, false otherwise + */ + public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { + if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); + if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); + + if ( vc1.getStart() != vc2.getStart() ) return false; + if ( vc1.getEnd() != vc2.getEnd() ) return false; + if ( ! vc1.getChr().equals(vc2.getChr())) return false; + if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; + return true; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java new file mode 100644 index 000000000..480f82a46 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
/*
 * Copyright (c) 2012 The Broad Institute -- standard Broad MIT license header applies
 * (see any sibling source file for the full text).
 */

package org.broadinstitute.sting.utils.haplotype;

import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.*;

/**
 * Unit tests for EventExtractor: neighborhood detection, block-substitution merging, and
 * adjacent SNP/indel merging.
 */
public class EventExtractorUnitTest extends BaseTest {
    private final static String CHR = "20";
    private final static String NAME = "foo";

    @DataProvider(name = "MyDataProvider")
    public Object[][] makeMyDataProvider() {
        List<Object[]> tests = new ArrayList<Object[]>();

        final List<String> SNP_ALLELES = Arrays.asList("A", "C");
        final List<String> INS_ALLELES = Arrays.asList("A", "ACGTGA");
        final List<String> DEL_ALLELES = Arrays.asList("ACGTA", "C");
        final List<List<String>> allAlleles = Arrays.asList(SNP_ALLELES, INS_ALLELES, DEL_ALLELES);
        for ( final int leftNotClump : Arrays.asList(-1, 3) ) {
            for ( final int middleNotClump : Arrays.asList(-1, 10, 500) ) {
                for ( final int rightNotClump : Arrays.asList(-1, 1000) ) {
                    for ( final int nClumped : Arrays.asList(3, 4) ) {
                        for ( final List<List<String>> alleles : Utils.makePermutations(allAlleles, nClumped, true)) {
                            final List<VariantContext> allVCS = new LinkedList<VariantContext>();

                            // isolated SNPs far from the clump; -1 means "don't add one"
                            if ( leftNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, leftNotClump, SNP_ALLELES));
                            if ( middleNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, middleNotClump, SNP_ALLELES));
                            if ( rightNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, rightNotClump, SNP_ALLELES));

                            // the clump itself: nClumped events spaced 3 bp apart starting at 50
                            int clumpStart = 50;
                            final List<VariantContext> vcs = new LinkedList<VariantContext>();
                            for ( final List<String> myAlleles : alleles ) {
                                final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(NAME, CHR, clumpStart, myAlleles);
                                clumpStart = vc.getEnd() + 3;
                                vcs.add(vc);
                            }

                            tests.add(new Object[]{new EventExtractor(new LinkedList<VariantContext>(allVCS)), Collections.<VariantContext>emptyList()});
                            allVCS.addAll(vcs);
                            tests.add(new Object[]{new EventExtractor(allVCS), vcs});
                        }
                    }
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * getNeighborhood must return exactly the clumped events when asked from the clump's leftmost
     * member, and a singleton for events outside any clump.
     */
    @Test(dataProvider = "MyDataProvider", enabled = true) // TODO == reenable
    public void testGetNeighborhood(final EventExtractor eventExtractor, final List<VariantContext> expectedNeighbors) {
        final VariantContext leftOfNeighbors = expectedNeighbors.isEmpty() ? null : expectedNeighbors.get(0);

        for ( final VariantContext vc : eventExtractor.getVariantContexts() ) {
            final List<VariantContext> n = eventExtractor.getNeighborhood(vc, 5);
            if ( leftOfNeighbors == vc )
                Assert.assertEquals(n, expectedNeighbors);
            else if ( ! expectedNeighbors.contains(vc) )
                Assert.assertEquals(n, Collections.singletonList(vc), "Should only contain the original vc but " + n);
        }
    }

    @DataProvider(name = "BlockSubstitutionsData")
    public Object[][] makeBlockSubstitutionsData() {
        List<Object[]> tests = new ArrayList<Object[]>();

        // pure mismatch runs of increasing size, all at position 1
        for ( int size = EventExtractor.MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION; size < 10; size++ ) {
            final String ref = Utils.dupString("A", size);
            final String alt = Utils.dupString("C", size);
            tests.add(new Object[]{ref, alt, size + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList(ref, alt))});
        }

        tests.add(new Object[]{"AAAAAA", "GAGAGA", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAA", "GAGAG"))});
        tests.add(new Object[]{"AAAAAA", "GAGAGG", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAAA", "GAGAGG"))});

        // SNPs flanking an indel, shifted right/left by a prefix/suffix of len A's
        for ( int len = 0; len < 10; len++ ) {
            final String s = len == 0 ? "" : Utils.dupString("A", len);
            tests.add(new Object[]{s + "AACCCCAA", s + "GAAG", (len + 2) + "M4D2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AACCCCAA", "GAAG"))});
            tests.add(new Object[]{s + "AAAA", s + "GACCCCAG", (len + 2) + "M4I2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AAAA", "GACCCCAG"))});

            tests.add(new Object[]{"AACCCCAA" + s, "GAAG" + s, "2M4D" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AACCCCAA", "GAAG"))});
            tests.add(new Object[]{"AAAA" + s, "GACCCCAG" + s, "2M4I" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAA", "GACCCCAG"))});
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Clumped events extracted from the haplotype must collapse into the single expected block substitution.
     */
    @Test(dataProvider = "BlockSubstitutionsData")
    public void testBlockSubstitutionsData(final String refBases, final String haplotypeBases, final String cigar, final VariantContext expectedBlock) {
        final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
        final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
        final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME);
        Assert.assertEquals(ee.getNumberOfEvents(), 1);
        final VariantContext actual = ee.getVariantContexts().iterator().next();
        Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual);
    }

    @DataProvider(name = "AdjacentSNPIndelTest")
    public Object[][] makeAdjacentSNPIndelTest() {
        List<Object[]> tests = new ArrayList<Object[]>();

        // each case: ref, haplotype, cigar, expected (ref, alt) allele pairs in start order
        tests.add(new Object[]{"TT", "GCT", "1M1I1M", Arrays.asList(Arrays.asList("T", "GC"))});
        tests.add(new Object[]{"GCT", "TT", "1M1D", Arrays.asList(Arrays.asList("GC", "T"))});
        tests.add(new Object[]{"TT", "GCCT", "1M2I1M", Arrays.asList(Arrays.asList("T", "GCC"))});
        tests.add(new Object[]{"GCCT", "TT", "1M2D", Arrays.asList(Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAGCCT", "AATT", "3M2D", Arrays.asList(Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAGCCT", "GATT", "3M2D", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAAAA", "AGACA", "5M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("A", "C"))});

        return tests.toArray(new Object[][]{});
    }

    /**
     * A SNP adjacent to an indel at the same start must be merged into a single block event with
     * the expected ref/alt alleles.
     */
    @Test(dataProvider = "AdjacentSNPIndelTest", enabled = true)
    public void testAdjacentSNPIndelTest(final String refBases, final String haplotypeBases, final String cigar, final List<List<String>> expectedAlleles) {
        final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
        final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
        final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME);
        Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size());
        final List<VariantContext> actuals = new ArrayList<VariantContext>(ee.getVariantContexts());
        for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {
            final VariantContext actual = actuals.get(i);
            Assert.assertEquals(actual.getReference().getDisplayString(), expectedAlleles.get(i).get(0));
            Assert.assertEquals(actual.getAlternateAllele(0).getDisplayString(), expectedAlleles.get(i).get(1));
        }
    }
}