System to merge multiple nearby alleles into block substitutions
-- Block substitution algorithm that merges nearby events based on distance. -- Also does some cleanup of GenotypingEngine
This commit is contained in:
parent
bff13bb5c5
commit
0310499b65
|
|
@ -432,7 +432,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
// for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
if( !activeAllelesToGenotype.isEmpty() ) {
|
||||
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
|
||||
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
|||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.EventExtractor;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
|
|
@ -72,7 +73,6 @@ public class GenotypingEngine {
|
|||
private final boolean DEBUG;
|
||||
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
private final VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ) {
|
||||
|
|
@ -145,99 +145,26 @@ public class GenotypingEngine {
|
|||
final GenomeLocParser genomeLocParser,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
// sanity check input arguments
|
||||
if (UG_engine == null)
|
||||
throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
|
||||
if (haplotypes == null || haplotypes.isEmpty())
|
||||
throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
||||
if (samples == null || samples.isEmpty())
|
||||
throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples);
|
||||
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty())
|
||||
throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap);
|
||||
if (ref == null || ref.length == 0 )
|
||||
throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
|
||||
if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length)
|
||||
throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
||||
if (activeRegionWindow == null )
|
||||
throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
||||
if (activeAllelesToGenotype == null )
|
||||
throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
||||
if (genomeLocParser == null )
|
||||
throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
|
||||
if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
|
||||
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
||||
if (samples == null || samples.isEmpty()) throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples);
|
||||
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap);
|
||||
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
|
||||
if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
||||
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
||||
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
||||
if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
|
||||
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
int count = 0;
|
||||
if( DEBUG ) { logger.info("=== Best Haplotypes ==="); }
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
|
||||
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( DEBUG ) {
|
||||
logger.info(h.toString());
|
||||
logger.info("> Cigar = " + h.getCigar());
|
||||
logger.info(">> Events = " + h.getEventMap());
|
||||
}
|
||||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
|
||||
}
|
||||
if( in_GGA_mode ) {
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
startPosKeySet.add( compVC.getStart() );
|
||||
}
|
||||
}
|
||||
|
||||
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
|
||||
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
|
||||
// that carry events among the haplotypes
|
||||
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, samples, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype);
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
|
||||
final List<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
|
||||
|
||||
if( !in_GGA_mode ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final Map<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
priorityList.add(vc.getSource());
|
||||
}
|
||||
}
|
||||
} else { // we are in GGA mode!
|
||||
int compCount = 0;
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
if( compVC.getStart() == loc ) {
|
||||
int alleleCount = 0;
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
List<Allele> alleleSet = new ArrayList<Allele>(2);
|
||||
alleleSet.add(compVC.getReference());
|
||||
alleleSet.add(compAltAllele);
|
||||
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
|
||||
// check if this event is already in the list of events due to a repeat in the input alleles track
|
||||
final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make();
|
||||
boolean alreadyExists = false;
|
||||
for( final VariantContext eventToTest : eventsAtThisLoc ) {
|
||||
if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) {
|
||||
alreadyExists = true;
|
||||
}
|
||||
}
|
||||
if( !alreadyExists ) {
|
||||
priorityList.add(vcSourceName);
|
||||
eventsAtThisLoc.add(candidateEventToAdd);
|
||||
}
|
||||
alleleCount++;
|
||||
}
|
||||
}
|
||||
compCount++;
|
||||
}
|
||||
}
|
||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
||||
|
||||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||
|
||||
|
|
@ -245,7 +172,7 @@ public class GenotypingEngine {
|
|||
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
|
||||
|
||||
// Sanity check the priority list for mistakes
|
||||
validatePriorityList( priorityList, eventsAtThisLoc );
|
||||
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
|
|
@ -264,7 +191,6 @@ public class GenotypingEngine {
|
|||
|
||||
if( DEBUG ) {
|
||||
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
|
|
@ -277,7 +203,6 @@ public class GenotypingEngine {
|
|||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
|
||||
|
||||
VariantContext annotatedCall = call;
|
||||
// TODO -- should be before annotated call, so that QDL works correctly
|
||||
if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||
}
|
||||
|
|
@ -295,6 +220,117 @@ public class GenotypingEngine {
|
|||
return new CalledHaplotypes(returnCalls, calledHaplotypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Go through the haplotypes we assembled, and decompose them into their constituent variant contexts
|
||||
*
|
||||
* @param haplotypes the list of haplotypes we're working with
|
||||
* @param samples the samples we're working with
|
||||
* @param haplotypeReadMap map from samples -> the per read allele likelihoods
|
||||
* @param ref the reference bases (over the same interval as the haplotypes)
|
||||
* @param refLoc the span of the reference bases
|
||||
* @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode)
|
||||
* @return a sorted set of reference start positions at which events occur across the haplotypes (including the GGA alleles when in GGA mode)
|
||||
*/
|
||||
private TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
|
||||
final List<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final List<VariantContext> activeAllelesToGenotype) {
|
||||
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
|
||||
int hapNumber = 0;
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
|
||||
if( DEBUG ) logger.info("=== Best Haplotypes ===");
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( new EventExtractor( h, ref, refLoc, "HC" + hapNumber++ ) );
|
||||
if( ! in_GGA_mode ) {
|
||||
startPosKeySet.addAll(h.getEventMap().getStartPositions());
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
logger.info(h.toString());
|
||||
logger.info("> Cigar = " + h.getCigar());
|
||||
logger.info(">> Events = " + h.getEventMap());
|
||||
}
|
||||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if ( !in_GGA_mode && samples.size() >= 10 ) {
|
||||
// if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
|
||||
}
|
||||
|
||||
if ( in_GGA_mode ) {
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
startPosKeySet.add( compVC.getStart() );
|
||||
}
|
||||
}
|
||||
|
||||
return startPosKeySet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view
|
||||
* @param vcs a list of variant contexts
|
||||
* @return the list of the sources of vcs in the same order
|
||||
*/
|
||||
private List<String> makePriorityList(final List<VariantContext> vcs) {
|
||||
final List<String> priorityList = new LinkedList<String>();
|
||||
for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource());
|
||||
|
||||
return priorityList;
|
||||
}
|
||||
|
||||
private List<VariantContext> getVCsAtThisLocation(final List<Haplotype> haplotypes,
|
||||
final int loc,
|
||||
final List<VariantContext> activeAllelesToGenotype) {
|
||||
// the overlapping events to merge into a common reference view
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final EventExtractor eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
}
|
||||
}
|
||||
} else { // we are in GGA mode!
|
||||
int compCount = 0;
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
if( compVC.getStart() == loc ) {
|
||||
int alleleCount = 0;
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
List<Allele> alleleSet = new ArrayList<Allele>(2);
|
||||
alleleSet.add(compVC.getReference());
|
||||
alleleSet.add(compAltAllele);
|
||||
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
|
||||
// check if this event is already in the list of events due to a repeat in the input alleles track
|
||||
final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make();
|
||||
boolean alreadyExists = false;
|
||||
for( final VariantContext eventToTest : eventsAtThisLoc ) {
|
||||
if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) {
|
||||
alreadyExists = true;
|
||||
}
|
||||
}
|
||||
if( !alreadyExists ) {
|
||||
eventsAtThisLoc.add(candidateEventToAdd);
|
||||
}
|
||||
alleleCount++;
|
||||
}
|
||||
}
|
||||
compCount++;
|
||||
}
|
||||
}
|
||||
|
||||
return eventsAtThisLoc;
|
||||
}
|
||||
|
||||
/**
|
||||
* For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele
|
||||
* @param samples List of samples to genotype
|
||||
|
|
@ -322,23 +358,6 @@ public class GenotypingEngine {
|
|||
return genotypes;
|
||||
}
|
||||
|
||||
private void validatePriorityList( final List<String> priorityList, final List<VariantContext> eventsAtThisLoc ) {
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if( !priorityList.contains(vc.getSource()) ) {
|
||||
throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
for( final String name : priorityList ) {
|
||||
boolean found = false;
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if(vc.getSource().equals(name)) { found = true; break; }
|
||||
}
|
||||
if( !found ) {
|
||||
throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, PerReadAlleleLikelihoodMap> filterToOnlyOverlappingReads( final GenomeLocParser parser,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> perSampleReadMap,
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
|
|
@ -382,10 +401,10 @@ public class GenotypingEngine {
|
|||
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
|
||||
final List<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().values() ) {
|
||||
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
|
||||
if( vc.isSymbolic() ) {
|
||||
for( final Haplotype h2 : haplotypes ) {
|
||||
for( final VariantContext vc2 : h2.getEventMap().values() ) {
|
||||
for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) {
|
||||
if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events
|
||||
haplotypesToRemove.add(h);
|
||||
break;
|
||||
|
|
@ -512,11 +531,10 @@ public class GenotypingEngine {
|
|||
|
||||
// remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final Map<Integer, VariantContext> eventMap = h.getEventMap();
|
||||
if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) {
|
||||
eventMap.remove(thisStart);
|
||||
eventMap.remove(nextStart);
|
||||
eventMap.put(mergedVC.getStart(), mergedVC);
|
||||
if( h.getEventMap().containsKey(thisStart) && h.getEventMap().containsKey(nextStart) ) {
|
||||
h.getEventMap().remove(thisStart);
|
||||
h.getEventMap().remove(nextStart);
|
||||
h.getEventMap().put(mergedVC.getStart(), mergedVC);
|
||||
}
|
||||
}
|
||||
startPosKeySet.add(mergedVC.getStart());
|
||||
|
|
@ -697,92 +715,9 @@ public class GenotypingEngine {
|
|||
return eventAllelesForSample;
|
||||
}
|
||||
|
||||
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) {
|
||||
final Map<Integer,VariantContext> vcs = new LinkedHashMap<Integer,VariantContext>();
|
||||
|
||||
int refPos = alignmentStartHapwrtRef;
|
||||
if( refPos < 0 ) { return null; } // Protection against SW failures
|
||||
int alignmentPos = 0;
|
||||
|
||||
for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) {
|
||||
final CigarElement ce = cigar.getCigarElement(cigarIndex);
|
||||
final int elementLength = ce.getLength();
|
||||
switch( ce.getOperator() ) {
|
||||
case I:
|
||||
{
|
||||
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
|
||||
final List<Allele> insertionAlleles = new ArrayList<Allele>();
|
||||
final int insertionStart = refLoc.getStart() + refPos - 1;
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) ) {
|
||||
insertionAlleles.add( Allele.create(refByte, true) );
|
||||
}
|
||||
if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele
|
||||
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
|
||||
} else {
|
||||
byte[] insertionBases = new byte[]{};
|
||||
insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base
|
||||
insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength ));
|
||||
if( BaseUtils.isAllRegularBases(insertionBases) ) {
|
||||
insertionAlleles.add( Allele.create(insertionBases, false) );
|
||||
}
|
||||
}
|
||||
if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
|
||||
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
|
||||
}
|
||||
}
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case S:
|
||||
{
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case D:
|
||||
{
|
||||
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
|
||||
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base
|
||||
final List<Allele> deletionAlleles = new ArrayList<Allele>();
|
||||
final int deletionStart = refLoc.getStart() + refPos - 1;
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
|
||||
deletionAlleles.add( Allele.create(deletionBases, true) );
|
||||
deletionAlleles.add( Allele.create(refByte, false) );
|
||||
vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
|
||||
}
|
||||
}
|
||||
refPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
{
|
||||
for( int iii = 0; iii < elementLength; iii++ ) {
|
||||
final byte refByte = ref[refPos];
|
||||
final byte altByte = alignment[alignmentPos];
|
||||
if( refByte != altByte ) { // SNP!
|
||||
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) {
|
||||
final List<Allele> snpAlleles = new ArrayList<Allele>();
|
||||
snpAlleles.add( Allele.create( refByte, true ) );
|
||||
snpAlleles.add( Allele.create( altByte, false ) );
|
||||
vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
|
||||
}
|
||||
}
|
||||
refPos++;
|
||||
alignmentPos++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case N:
|
||||
case H:
|
||||
case P:
|
||||
default:
|
||||
throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
|
||||
}
|
||||
}
|
||||
return vcs;
|
||||
@Deprecated
|
||||
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) {
|
||||
return new EventExtractor(haplotype, ref, refLoc, sourceNameToAdd);
|
||||
}
|
||||
|
||||
protected static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {
|
||||
|
|
|
|||
|
|
@ -360,8 +360,8 @@ public class HaplotypeResolver extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
// order results by start position
|
||||
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1));
|
||||
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2));
|
||||
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1));
|
||||
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2));
|
||||
if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
|
||||
// TODO -- handle errors appropriately
|
||||
logger.debug("No source alleles; aborting at " + refContext.getLocus());
|
||||
|
|
|
|||
|
|
@ -199,7 +199,8 @@ public class GenotypingEngineUnitTest extends BaseTest {
|
|||
|
||||
public Map<Integer,VariantContext> calcAlignment() {
|
||||
final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap);
|
||||
return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name");
|
||||
final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar());
|
||||
return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,307 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.haplotype;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Extract simple VariantContext events from a single haplotype
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/27/13
|
||||
* Time: 8:35 AM
|
||||
*/
|
||||
public class EventExtractor extends TreeMap<Integer, VariantContext> {
|
||||
private final static Logger logger = Logger.getLogger(EventExtractor.class);
|
||||
private final static boolean mergeClumpedEvents = true;
|
||||
protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3;
|
||||
public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
|
||||
public EventExtractor( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) {
|
||||
super();
|
||||
|
||||
processCigarForInitialEvents(haplotype, ref, refLoc, sourceNameToAdd);
|
||||
if ( mergeClumpedEvents && getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) {
|
||||
replaceClumpedEventsWithBlockSubstititions(haplotype, ref, refLoc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For testing. Lets you set up an explicit configuration without having to process a haplotype and reference
|
||||
* @param stateForTesting
|
||||
*/
|
||||
protected EventExtractor(final Map<Integer, VariantContext> stateForTesting) {
|
||||
super(stateForTesting);
|
||||
}
|
||||
|
||||
/**
|
||||
* For testing. Lets you set up an explicit configuration without having to process a haplotype and reference
|
||||
* @param stateForTesting
|
||||
*/
|
||||
protected EventExtractor(final Collection<VariantContext> stateForTesting) {
|
||||
for ( final VariantContext vc : stateForTesting )
|
||||
addVC(vc);
|
||||
}
|
||||
|
||||
protected void processCigarForInitialEvents(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) {
|
||||
final Cigar cigar = haplotype.getCigar();
|
||||
final byte[] alignment = haplotype.getBases();
|
||||
|
||||
int refPos = haplotype.getAlignmentStartHapwrtRef();
|
||||
if( refPos < 0 ) {
|
||||
return;
|
||||
} // Protection against SW failures
|
||||
|
||||
int alignmentPos = 0;
|
||||
|
||||
for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) {
|
||||
final CigarElement ce = cigar.getCigarElement(cigarIndex);
|
||||
final int elementLength = ce.getLength();
|
||||
switch( ce.getOperator() ) {
|
||||
case I:
|
||||
{
|
||||
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
|
||||
final List<Allele> insertionAlleles = new ArrayList<Allele>();
|
||||
final int insertionStart = refLoc.getStart() + refPos - 1;
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) ) {
|
||||
insertionAlleles.add( Allele.create(refByte, true) );
|
||||
}
|
||||
if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele
|
||||
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
|
||||
} else {
|
||||
byte[] insertionBases = new byte[]{};
|
||||
insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base
|
||||
insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength));
|
||||
if( BaseUtils.isAllRegularBases(insertionBases) ) {
|
||||
insertionAlleles.add( Allele.create(insertionBases, false) );
|
||||
}
|
||||
}
|
||||
if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
|
||||
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
|
||||
}
|
||||
}
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case S:
|
||||
{
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case D:
|
||||
{
|
||||
if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig
|
||||
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base
|
||||
final List<Allele> deletionAlleles = new ArrayList<Allele>();
|
||||
final int deletionStart = refLoc.getStart() + refPos - 1;
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
|
||||
deletionAlleles.add( Allele.create(deletionBases, true) );
|
||||
deletionAlleles.add( Allele.create(refByte, false) );
|
||||
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
|
||||
}
|
||||
}
|
||||
refPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
{
|
||||
for( int iii = 0; iii < elementLength; iii++ ) {
|
||||
final byte refByte = ref[refPos];
|
||||
final byte altByte = alignment[alignmentPos];
|
||||
if( refByte != altByte ) { // SNP!
|
||||
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) {
|
||||
final List<Allele> snpAlleles = new ArrayList<Allele>();
|
||||
snpAlleles.add( Allele.create( refByte, true ) );
|
||||
snpAlleles.add( Allele.create( altByte, false ) );
|
||||
addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
|
||||
}
|
||||
}
|
||||
refPos++;
|
||||
alignmentPos++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case N:
|
||||
case H:
|
||||
case P:
|
||||
default:
|
||||
throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void addVC(final VariantContext vc) {
|
||||
addVC(vc, true);
|
||||
}
|
||||
|
||||
private void addVC(final VariantContext vc, final boolean merge) {
|
||||
if ( containsKey(vc.getStart()) ) {
|
||||
if ( merge ) {
|
||||
final VariantContext prev = get(vc.getStart());
|
||||
put(vc.getStart(), makeBlock(prev, vc));
|
||||
} else {
|
||||
throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc);
|
||||
}
|
||||
} else
|
||||
put(vc.getStart(), vc);
|
||||
}
|
||||
|
||||
private VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) {
|
||||
if ( ! vc1.isSNP() ) throw new IllegalArgumentException("vc1 must be a snp");
|
||||
|
||||
Allele ref, alt;
|
||||
final VariantContextBuilder b = new VariantContextBuilder(vc1);
|
||||
if ( vc1.getReference().equals(vc2.getReference()) ) {
|
||||
// we've got an insertion, so we just update the alt to have the prev alt
|
||||
ref = vc1.getReference();
|
||||
alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false);
|
||||
} else {
|
||||
// we're dealing with a deletion, so we patch the ref
|
||||
ref = vc2.getReference();
|
||||
alt = vc1.getAlternateAllele(0);
|
||||
b.stop(vc2.getEnd());
|
||||
}
|
||||
|
||||
return b.alleles(Arrays.asList(ref, alt)).make();
|
||||
}
|
||||
|
||||
// TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it
|
||||
@Requires("getNumberOfEvents() > 0")
|
||||
protected void replaceClumpedEventsWithBlockSubstititions(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) {
|
||||
int lastStart = -1;
|
||||
for ( boolean foundOne = true; foundOne; ) {
|
||||
foundOne = false;
|
||||
for ( final VariantContext vc : getVariantContexts() ) {
|
||||
if ( vc.getStart() > lastStart ) {
|
||||
lastStart = vc.getStart();
|
||||
final List<VariantContext> neighborhood = getNeighborhood(vc, 10);
|
||||
if ( updateToBlockSubstitutionIfBetter(neighborhood, haplotype, ref, refLoc) ) {
|
||||
foundOne = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean updateToBlockSubstitutionIfBetter(final List<VariantContext> neighbors, final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) {
|
||||
if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION)
|
||||
return false;
|
||||
// TODO -- need more tests to decide if this is really so good
|
||||
|
||||
final VariantContext first = neighbors.get(0);
|
||||
final int refStartOffset = first.getStart() - refLoc.getStart();
|
||||
final int refEndOffset = neighbors.get(neighbors.size() - 1).getEnd() - refLoc.getStart();
|
||||
|
||||
final byte[] refBases = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1);
|
||||
final byte[] hapBases = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar());
|
||||
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(first);
|
||||
builder.stop(first.getStart() + refBases.length - 1);
|
||||
builder.alleles(Arrays.asList(Allele.create(refBases, true), Allele.create(hapBases)));
|
||||
final VariantContext block = builder.make();
|
||||
|
||||
// remove all merged events
|
||||
for ( final VariantContext merged : neighbors ) {
|
||||
if ( remove(merged.getStart()) == null )
|
||||
throw new IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged);
|
||||
}
|
||||
|
||||
// note must be after we remove the previous events as the treeset only allows one key per start
|
||||
logger.info("Transforming into block substitution at " + block);
|
||||
addVC(block, false);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all of the variant contexts starting at leftMost that are within maxBP of each other
|
||||
*
|
||||
* @param leftMost the left most (smallest position) variant context that will start the neighborhood
|
||||
* @param maxBPBetweenEvents the maximum distance in BP between the end of one event the start of the next
|
||||
* to be included the the resulting list
|
||||
* @return a list that contains at least one element (leftMost)
|
||||
*/
|
||||
@Requires({"leftMost != null", "maxBPBetweenEvents >= 0"})
|
||||
@Ensures({"result != null", "! result.isEmpty()"})
|
||||
protected List<VariantContext> getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) {
|
||||
final List<VariantContext> neighbors = new LinkedList<VariantContext>();
|
||||
|
||||
VariantContext left = leftMost;
|
||||
for ( final VariantContext vc : getVariantContexts() ) {
|
||||
if ( vc.getStart() < leftMost.getStart() )
|
||||
continue;
|
||||
|
||||
if ( vc.getStart() - left.getEnd() < maxBPBetweenEvents ) {
|
||||
// this vc is within max distance to the end of the left event, so accumulate it
|
||||
neighbors.add(vc);
|
||||
left = vc;
|
||||
}
|
||||
}
|
||||
|
||||
return neighbors;
|
||||
}
|
||||
|
||||
public Set<Integer> getStartPositions() {
|
||||
return keySet();
|
||||
}
|
||||
|
||||
public Collection<VariantContext> getVariantContexts() {
|
||||
return values();
|
||||
}
|
||||
|
||||
public int getNumberOfEvents() {
|
||||
return size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder b = new StringBuilder("EventExtractor{");
|
||||
for ( final VariantContext vc : getVariantContexts() )
|
||||
b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles()));
|
||||
b.append("}");
|
||||
return b.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -43,7 +43,7 @@ import java.util.*;
|
|||
|
||||
public class Haplotype extends Allele {
|
||||
private GenomeLoc genomeLocation = null;
|
||||
private Map<Integer, VariantContext> eventMap = null;
|
||||
private EventExtractor eventMap = null;
|
||||
private Cigar cigar;
|
||||
private int alignmentStartHapwrtRef;
|
||||
private Event artificialEvent = null;
|
||||
|
|
@ -63,6 +63,12 @@ public class Haplotype extends Allele {
|
|||
this(bases, false);
|
||||
}
|
||||
|
||||
    /**
     * Create a haplotype with an explicit alignment back to the reference.
     *
     * @param bases the bases of this haplotype
     * @param isRef whether this haplotype is the reference haplotype
     * @param alignmentStartHapwrtRef offset of this haplotype's alignment start w.r.t. the reference
     * @param cigar the cigar describing the haplotype-to-reference alignment
     */
    public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) {
        this(bases, isRef);
        this.alignmentStartHapwrtRef = alignmentStartHapwrtRef;
        this.cigar = cigar;
    }
|
||||
|
||||
/**
|
||||
* Copy constructor. Note the ref state of the provided allele is ignored!
|
||||
*
|
||||
|
|
@ -92,11 +98,11 @@ public class Haplotype extends Allele {
|
|||
return Arrays.hashCode(getBases());
|
||||
}
|
||||
|
||||
public Map<Integer, VariantContext> getEventMap() {
|
||||
public EventExtractor getEventMap() {
|
||||
return eventMap;
|
||||
}
|
||||
|
||||
public void setEventMap( final Map<Integer, VariantContext> eventMap ) {
|
||||
public void setEventMap( final EventExtractor eventMap ) {
|
||||
this.eventMap = eventMap;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -48,6 +48,67 @@ public final class AlignmentUtils {
|
|||
    // cannot be instantiated -- static utility methods only
    private AlignmentUtils() { }
|
||||
|
||||
/**
|
||||
* Get the byte[] from bases that cover the reference interval refStart -> refEnd given the
|
||||
* alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference
|
||||
*
|
||||
* refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the reference
|
||||
* bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X.
|
||||
*
|
||||
* @param bases
|
||||
* @param refStart
|
||||
* @param refEnd
|
||||
* @param basesStartOnRef where does the bases array start w.r.t. the reference start? For example, bases[0] of
|
||||
* could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at
|
||||
* 10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10.
|
||||
* It's not trivial to eliminate this parameter because it's tied up with the cigar
|
||||
* @param basesToRefCigar the cigar that maps the bases to the reference genome
|
||||
* @return a non-null byte[]
|
||||
*/
|
||||
public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) {
|
||||
if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd);
|
||||
if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef);
|
||||
if ( bases == null ) throw new IllegalArgumentException("Bases cannot be null");
|
||||
if ( basesToRefCigar == null ) throw new IllegalArgumentException("basesToRefCigar cannot be null");
|
||||
if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar);
|
||||
|
||||
int refPos = basesStartOnRef;
|
||||
int basesPos = 0;
|
||||
|
||||
int basesStart = -1;
|
||||
int basesStop = -1;
|
||||
boolean done = false;
|
||||
|
||||
for ( int iii = 0; ! done && iii < basesToRefCigar.numCigarElements(); iii++ ) {
|
||||
final CigarElement ce = basesToRefCigar.getCigarElement(iii);
|
||||
final int bInc, rInc;
|
||||
switch ( ce.getOperator() ) {
|
||||
case I: bInc = 1; rInc = 0; break;
|
||||
case M: case X: case EQ: bInc = rInc = 1; break;
|
||||
case D: bInc = 0; rInc = 1; break;
|
||||
default:
|
||||
throw new IllegalStateException("Unsupported operator " + ce);
|
||||
}
|
||||
|
||||
for ( int i = 0; i < ce.getLength(); i++ ) {
|
||||
if ( refPos == refStart )
|
||||
basesStart = basesPos;
|
||||
if ( refPos == refEnd ) {
|
||||
basesStop = basesPos;
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
refPos += rInc;
|
||||
basesPos += bInc;
|
||||
}
|
||||
}
|
||||
|
||||
if ( basesStart == -1 || basesStop == -1 )
|
||||
throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar);
|
||||
|
||||
return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of bases at which refSeq and readSeq differ, given their alignment
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1424,4 +1424,21 @@ public class GATKVariantContextUtils {
|
|||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are vc1 and 2 equal including their position and alleles?
|
||||
* @param vc1 non-null VariantContext
|
||||
* @param vc2 non-null VariantContext
|
||||
* @return true if vc1 and vc2 are equal, false otherwise
|
||||
*/
|
||||
public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) {
|
||||
if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null");
|
||||
if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null");
|
||||
|
||||
if ( vc1.getStart() != vc2.getStart() ) return false;
|
||||
if ( vc1.getEnd() != vc2.getEnd() ) return false;
|
||||
if ( ! vc1.getChr().equals(vc2.getChr())) return false;
|
||||
if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,171 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.haplotype;
|
||||
|
||||
import net.sf.samtools.TextCigarCodec;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Unit tests for EventExtractor: neighborhood discovery, block-substitution merging,
 * and extraction of adjacent SNP/indel events from haplotype alignments.
 */
public class EventExtractorUnitTest extends BaseTest {
    private final static String CHR = "20";   // contig used for all synthetic events
    private final static String NAME = "foo"; // source name stamped on all synthetic events

    /**
     * Builds pairs of (EventExtractor, expected clump): isolated SNPs at optional
     * left/middle/right positions plus a clump of 3-4 events spaced 3 bp apart
     * starting at position 50, over all permutations of SNP/insertion/deletion alleles.
     */
    @DataProvider(name = "MyDataProvider")
    public Object[][] makeMyDataProvider() {
        List<Object[]> tests = new ArrayList<Object[]>();

        final List<String> SNP_ALLELES = Arrays.asList("A", "C");
        final List<String> INS_ALLELES = Arrays.asList("A", "ACGTGA");
        final List<String> DEL_ALLELES = Arrays.asList("ACGTA", "C");
        final List<List<String>> allAlleles = Arrays.asList(SNP_ALLELES, INS_ALLELES, DEL_ALLELES);
        // -1 means "don't add this isolated event"
        for ( final int leftNotClump : Arrays.asList(-1, 3) ) {
            for ( final int middleNotClump : Arrays.asList(-1, 10, 500) ) {
                for ( final int rightNotClump : Arrays.asList(-1, 1000) ) {
                    for ( final int nClumped : Arrays.asList(3, 4) ) {
                        for ( final List<List<String>> alleles : Utils.makePermutations(allAlleles, nClumped, true)) {
                            final List<VariantContext> allVCS = new LinkedList<VariantContext>();

                            if ( leftNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, leftNotClump, SNP_ALLELES));
                            if ( middleNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, middleNotClump, SNP_ALLELES));
                            if ( rightNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, rightNotClump, SNP_ALLELES));

                            // build the clumped events, each starting 3 bp after the previous one ends
                            int clumpStart = 50;
                            final List<VariantContext> vcs = new LinkedList<VariantContext>();
                            for ( final List<String> myAlleles : alleles ) {
                                final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(NAME, CHR, clumpStart, myAlleles);
                                clumpStart = vc.getEnd() + 3;
                                vcs.add(vc);
                            }

                            // case 1: only isolated events -> no clump expected
                            tests.add(new Object[]{new EventExtractor(new LinkedList<VariantContext>(allVCS)), Collections.emptyList()});
                            allVCS.addAll(vcs);
                            // case 2: isolated events plus the clump -> clump expected
                            tests.add(new Object[]{new EventExtractor(allVCS), vcs});
                        }
                    }
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Checks getNeighborhood(vc, 5): the left-most member of the expected clump must
     * yield exactly the clump, and events outside the clump must yield only themselves.
     */
    @Test(dataProvider = "MyDataProvider", enabled = true) // TODO == reenable
    public void testGetNeighborhood(final EventExtractor eventExtractor, final List<VariantContext> expectedNeighbors) {
        final VariantContext leftOfNeighors = expectedNeighbors.isEmpty() ? null : expectedNeighbors.get(0);

        for ( final VariantContext vc : eventExtractor.getVariantContexts() ) {
            final List<VariantContext> n = eventExtractor.getNeighborhood(vc, 5);
            if ( leftOfNeighors == vc )
                Assert.assertEquals(n, expectedNeighbors);
            else if ( ! expectedNeighbors.contains(vc) )
                Assert.assertEquals(n, Collections.singletonList(vc), "Should only contain the original vc but " + n);
        }
    }

    /**
     * Builds (refBases, haplotypeBases, cigar, expected merged event) tuples where the
     * haplotype's events should collapse into a single block substitution.
     */
    @DataProvider(name = "BlockSubstitutionsData")
    public Object[][] makeBlockSubstitutionsData() {
        List<Object[]> tests = new ArrayList<Object[]>();

        // pure SNP runs long enough to trigger merging
        for ( int size = EventExtractor.MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION; size < 10; size++ ) {
            final String ref = Utils.dupString("A", size);
            final String alt = Utils.dupString("C", size);
            tests.add(new Object[]{ref, alt, size + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList(ref, alt))});
        }

        // alternating mismatches
        tests.add(new Object[]{"AAAAAA", "GAGAGA", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAA", "GAGAG"))});
        tests.add(new Object[]{"AAAAAA", "GAGAGG", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAAA", "GAGAGG"))});

        // SNPs flanking an indel, with varying amounts of leading/trailing matching context
        for ( int len = 0; len < 10; len++ ) {
            final String s = len == 0 ? "" : Utils.dupString("A", len);
            tests.add(new Object[]{s + "AACCCCAA", s + "GAAG", len + 2 + "M4D2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AACCCCAA", "GAAG"))});
            tests.add(new Object[]{s + "AAAA", s + "GACCCCAG", len + 2 + "M4I2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AAAA", "GACCCCAG"))});

            tests.add(new Object[]{"AACCCCAA" + s, "GAAG" + s, "2M4D" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AACCCCAA", "GAAG"))});
            tests.add(new Object[]{"AAAA" + s, "GACCCCAG" + s, "2M4I" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAA", "GACCCCAG"))});
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Checks that extracting events from the given haplotype alignment yields exactly
     * one event matching the expected block substitution.
     */
    @Test(dataProvider = "BlockSubstitutionsData")
    public void testBlockSubstitutionsData(final String refBases, final String haplotypeBases, final String cigar, final VariantContext expectedBlock) {
        final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
        final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
        final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME);
        Assert.assertEquals(ee.getNumberOfEvents(), 1);
        final VariantContext actual = ee.getVariantContexts().iterator().next();
        Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual);
    }

    /**
     * Builds (refBases, haplotypeBases, cigar, expected [ref, alt] allele pairs) tuples
     * exercising SNPs directly adjacent to insertions and deletions.
     */
    @DataProvider(name = "AdjacentSNPIndelTest")
    public Object[][] makeAdjacentSNPIndelTest() {
        List<Object[]> tests = new ArrayList<Object[]>();

        tests.add(new Object[]{"TT", "GCT", "1M1I1M", Arrays.asList(Arrays.asList("T", "GC"))});
        tests.add(new Object[]{"GCT", "TT", "1M1D", Arrays.asList(Arrays.asList("GC", "T"))});
        tests.add(new Object[]{"TT", "GCCT", "1M2I1M", Arrays.asList(Arrays.asList("T", "GCC"))});
        tests.add(new Object[]{"GCCT", "TT", "1M2D", Arrays.asList(Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAGCCT", "AATT", "3M2D", Arrays.asList(Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAGCCT", "GATT", "3M2D", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("GCC", "T"))});
        tests.add(new Object[]{"AAAAA", "AGACA", "5M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("A", "C"))});

        return tests.toArray(new Object[][]{});
    }

    /**
     * Checks that each extracted event's ref/alt alleles match the expected pairs,
     * in reference order.
     */
    @Test(dataProvider = "AdjacentSNPIndelTest", enabled = true)
    public void testAdjacentSNPIndelTest(final String refBases, final String haplotypeBases, final String cigar, final List<List<String>> expectedAlleles) {
        final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar));
        final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length());
        final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME);
        Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size());
        final List<VariantContext> actuals = new ArrayList<VariantContext>(ee.getVariantContexts());
        for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) {
            final VariantContext actual = actuals.get(i);
            Assert.assertEquals(actual.getReference().getDisplayString(), expectedAlleles.get(i).get(0));
            Assert.assertEquals(actual.getAlternateAllele(0).getDisplayString(), expectedAlleles.get(i).get(1));
        }
    }
}
|
||||
Loading…
Reference in New Issue