Merging latest CMI updates with UNSTABLE

This commit is contained in:
Mauricio Carneiro 2012-11-27 09:08:00 -05:00
commit 97fd5de260
46 changed files with 1833 additions and 424 deletions

View File

@ -31,7 +31,6 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -42,163 +41,26 @@ import java.util.*;
public class GenotypingEngine {
private final boolean DEBUG;
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
public GenotypingEngine( final boolean DEBUG ) {
this.DEBUG = DEBUG;
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
noCall.add(Allele.NO_CALL);
}
// WARN
// This function is the streamlined approach, currently not being used by default
// WARN
// WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
// WARN
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
final ArrayList<Haplotype> haplotypes,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser ) {
// Prepare the list of haplotype indices to genotype
final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();
for( final Haplotype h : haplotypes ) {
allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) );
}
final int numHaplotypes = haplotypes.size();
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample);
int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
}
}
genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make());
}
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here
// Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF
haplotypesToRemove.add(h);
}
}
haplotypes.removeAll(haplotypesToRemove);
if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
final List<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>> returnVCs = new ArrayList<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>>();
// set up the default 1-to-1 haplotype mapping object
final HashMap<Allele,ArrayList<Haplotype>> haplotypeMapping = new HashMap<Allele,ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
list.add(h);
haplotypeMapping.put(call.getAllele(h.getBases()), list);
}
returnVCs.add( new Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>(call,haplotypeMapping) );
return returnVCs;
}
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
int count = 0;
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
for( final Haplotype h : haplotypes ) {
if( DEBUG ) {
System.out.println( h.toString() );
System.out.println( "> Cigar = " + h.getCigar() );
}
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
startPosKeySet.addAll(h.getEventMap().keySet());
}
// Create the VC merge priority list
final ArrayList<String> priorityList = new ArrayList<String>();
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
priorityList.add("HC" + iii);
}
// Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
for( final Haplotype h : haplotypes ) {
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
eventsAtThisLoc.add(vc);
}
}
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
// Merge the event to find a common reference representation
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
int aCount = 0;
for( final Allele a : mergedVC.getAlleles() ) {
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
}
if( DEBUG ) {
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
}
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
final int myNumHaplotypes = alleleMapper.size();
final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
int glIndex = 0;
for( int iii = 0; iii < myNumHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
}
}
// using the allele mapping object translate the haplotype allele into the event allele
final Genotype g = new GenotypeBuilder(sample)
.alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes))
.phased(loc != startPosKeySet.first())
.PL(genotypeLikelihoods).make();
myGenotypes.add(g);
}
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(
new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) );
}
}
return returnCalls;
}
// BUGBUG: Create a class to hold this complicated return type
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
final ArrayList<Haplotype> haplotypes,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final ArrayList<VariantContext> activeAllelesToGenotype ) {
public List<Pair<VariantContext, Map<Allele, List<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
final List<Haplotype> haplotypes,
final byte[] ref,
final GenomeLoc refLoc,
final GenomeLoc activeRegionWindow,
final GenomeLocParser genomeLocParser,
final List<VariantContext> activeAllelesToGenotype ) {
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
final ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>>();
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
@ -207,7 +69,7 @@ public class GenotypingEngine {
for( final Haplotype h : haplotypes ) {
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( DEBUG ) {
System.out.println( h.toString() );
System.out.println( "> Cigar = " + h.getCigar() );
@ -217,10 +79,10 @@ public class GenotypingEngine {
}
cleanUpSymbolicUnassembledEvents( haplotypes );
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
}
if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
if( in_GGA_mode ) {
for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() );
}
@ -232,7 +94,7 @@ public class GenotypingEngine {
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
if( activeAllelesToGenotype.isEmpty() ) {
if( !in_GGA_mode ) {
for( final Haplotype h : haplotypes ) {
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
@ -261,7 +123,14 @@ public class GenotypingEngine {
if( eventsAtThisLoc.isEmpty() ) { continue; }
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
final ArrayList<Allele> alleleOrdering = new ArrayList<Allele>(alleleMapper.size());
alleleOrdering.add(refAllele);
for( final VariantContext vc : eventsAtThisLoc ) {
alleleOrdering.add(vc.getAlternateAllele(0));
}
// Sanity check the priority list
for( final VariantContext vc : eventsAtThisLoc ) {
@ -283,11 +152,15 @@ public class GenotypingEngine {
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
int aCount = 0;
for( final Allele a : mergedVC.getAlleles() ) {
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
// let's update the Allele keys in the mapper because they can change after merging when there are complex events
Map<Allele, List<Haplotype>> updatedAlleleMapper = new HashMap<Allele, List<Haplotype>>(alleleMapper.size());
for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) {
final Allele oldAllele = alleleOrdering.get(i);
final Allele newAllele = mergedVC.getAlleles().get(i);
updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele));
alleleOrdering.set(i, newAllele);
}
alleleMapper = updatedAlleleMapper;
if( DEBUG ) {
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
@ -299,7 +172,7 @@ public class GenotypingEngine {
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
final int numHaplotypes = alleleMapper.size();
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering);
int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
@ -313,23 +186,23 @@ public class GenotypingEngine {
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
// also, need to update the allele -> haplotype mapping
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
final HashMap<Allele, List<Haplotype>> alleleHashMapTrim = new HashMap<Allele, List<Haplotype>>();
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii)));
}
call = vcCallTrim;
alleleHashMap = alleleHashMapTrim;
alleleMapper = alleleHashMapTrim;
}
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
returnCalls.add( new Pair<VariantContext, Map<Allele,List<Haplotype>>>(call, alleleMapper) );
}
}
}
return returnCalls;
}
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes ) {
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().values() ) {
@ -348,7 +221,7 @@ public class GenotypingEngine {
haplotypes.removeAll(haplotypesToRemove);
}
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
final int MAX_SIZE_TO_COMBINE = 15;
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
if( startPosKeySet.size() <= 1 ) { return; }
@ -395,7 +268,9 @@ public class GenotypingEngine {
final ArrayList<Haplotype> haplotypeList = new ArrayList<Haplotype>();
haplotypeList.add(h);
for( final String sample : haplotypes.get(0).getSampleKeySet() ) {
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0];
final HashSet<String> sampleSet = new HashSet<String>(1);
sampleSet.add(sample);
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0];
if( thisHapVC == null ) {
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }
@ -489,37 +364,87 @@ public class GenotypingEngine {
@Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
protected static ArrayList<ArrayList<Haplotype>> createAlleleMapper( final int loc, final ArrayList<VariantContext> eventsAtThisLoc, final ArrayList<Haplotype> haplotypes ) {
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
final ArrayList<Haplotype> refList = new ArrayList<Haplotype>();
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
final Map<Allele, List<Haplotype>> alleleMapper = new HashMap<Allele, List<Haplotype>>(eventsAtThisLoc.size()+1);
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
alleleMapper.put(refAllele, new ArrayList<Haplotype>());
for( final VariantContext vc : eventsAtThisLoc )
alleleMapper.put(vc.getAlternateAllele(0), new ArrayList<Haplotype>());
final ArrayList<Haplotype> undeterminedHaplotypes = new ArrayList<Haplotype>(haplotypes.size());
for( final Haplotype h : haplotypes ) {
if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
refList.add(h);
alleleMapper.get(refAllele).add(h);
} else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) {
alleleMapper.get(h.getArtificialAllele()).add(h);
} else {
boolean foundInEventList = false;
boolean haplotypeIsDetermined = false;
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
foundInEventList = true;
alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h);
haplotypeIsDetermined = true;
break;
}
}
if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype
refList.add(h);
}
if( !haplotypeIsDetermined )
undeterminedHaplotypes.add(h);
}
}
alleleMapper.add(refList);
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
for( final Haplotype h : haplotypes ) {
if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
list.add(h);
for( final Haplotype h : undeterminedHaplotypes ) {
Allele matchingAllele = null;
for( final Map.Entry<Allele, List<Haplotype>> alleleToTest : alleleMapper.entrySet() ) {
// don't test against the reference allele
if( alleleToTest.getKey().equals(refAllele) )
continue;
final Haplotype artificialHaplotype = alleleToTest.getValue().get(0);
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
matchingAllele = alleleToTest.getKey();
break;
}
}
alleleMapper.add(list);
if( matchingAllele == null )
matchingAllele = refAllele;
alleleMapper.get(matchingAllele).add(h);
}
return alleleMapper;
}
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset, final boolean resolveSupersetToSubset) {
for ( final Map.Entry<Integer, VariantContext> fromSubset : subset.entrySet() ) {
final VariantContext fromSuperset = superset.get(fromSubset.getKey());
if ( fromSuperset == null )
return false;
List<Allele> supersetAlleles = fromSuperset.getAlternateAlleles();
if ( resolveSupersetToSubset )
supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
return false;
}
return true;
}
private static List<Allele> resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List<Allele> currentAlleles) {
if ( targetReference.length() <= actualReference.length() )
return currentAlleles;
final List<Allele> newAlleles = new ArrayList<Allele>(currentAlleles.size());
final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
for ( final Allele a : currentAlleles ) {
newAlleles.add(Allele.extend(a, extraBases));
}
return newAlleles;
}
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final ArrayList<ArrayList<Haplotype>> alleleMapper, final ArrayList<Haplotype> haplotypes ) {
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }

View File

@ -131,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 1;
@Advanced
@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
@Advanced
@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
@Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10;
@ -248,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.CONTAMINATION_FRACTION = 0.0;
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
@ -273,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
VCFConstants.GENOTYPE_QUALITY_KEY,
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_PL_KEY);
// header lines for the experimental HaplotypeCaller-specific annotations
headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
// where the filters are used. For example, in emitting all sites the lowQual field is used
@ -298,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
genotypingEngine = new GenotypingEngine( DEBUG );
}
//---------------------------------------------------------------------------------------------------------------
@ -421,52 +405,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() )
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
for( final Pair<VariantContext, Map<Allele, List<Haplotype>>> callResult :
genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) {
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
if( !GENOTYPE_FULL_ACTIVE_REGION ) {
// add some custom annotations to the calls
// Calculate the number of variants on the haplotype
int maxNumVar = 0;
for( final Allele allele : callResult.getFirst().getAlleles() ) {
if( !allele.isReference() ) {
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
final int numVar = haplotype.getEventMap().size();
if( numVar > maxNumVar ) { maxNumVar = numVar; }
}
}
}
// Calculate the event length
int maxLength = 0;
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
final int length = a.length() - annotatedCall.getReference().length();
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
}
myAttributes.put("NVH", maxNumVar);
myAttributes.put("NumHapEval", bestHaplotypes.size());
myAttributes.put("NumHapAssembly", haplotypes.size());
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
myAttributes.put("EVENTLENGTH", maxLength);
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
myAttributes.put("extType", annotatedCall.getType().toString() );
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
//}
if( annotatedCall.hasAttribute("QD") ) {
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
}
}
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
}
@ -522,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
// protect against INTERVALS with abnormally high coverage
// BUGBUG: remove when positinal downsampler is hooked up to ART/HC
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
activeRegion.add(clippedRead);
}

View File

@ -148,45 +148,31 @@ public class LikelihoodCalculationEngine {
return Math.min(b1.length, b2.length);
}
@Requires({"haplotypes.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final String sample ) {
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
list.add(h);
haplotypeMapping.add(list);
}
return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping );
}
// This function takes just a single sample and a haplotypeMapping
@Requires({"haplotypeMapping.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
final TreeSet<String> sampleSet = new TreeSet<String>();
sampleSet.add(sample);
return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping);
return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering);
}
// This function takes a set of samples to pool over and a haplotypeMapping
@Requires({"haplotypeMapping.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
final int numHaplotypes = haplotypeMapping.size();
final int numHaplotypes = alleleOrdering.size();
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
for( int iii = 0; iii < numHaplotypes; iii++ ) {
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
}
// compute the diploid haplotype likelihoods
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) {
for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) {
double haplotypeLikelihood = 0.0;
for( final String sample : samples ) {
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
@ -200,12 +186,48 @@ public class LikelihoodCalculationEngine {
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
}
}
}
}
}
// normalize the diploid likelihoods matrix
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
}
// This function takes a set of samples to pool over and a haplotypeMapping
@Requires({"haplotypeList.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final List<Haplotype> haplotypeList ) {
final int numHaplotypes = haplotypeList.size();
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
for( int iii = 0; iii < numHaplotypes; iii++ ) {
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
}
// compute the diploid haplotype likelihoods
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
for( int iii = 0; iii < numHaplotypes; iii++ ) {
final Haplotype iii_haplotype = haplotypeList.get(iii);
for( int jjj = 0; jjj <= iii; jjj++ ) {
final Haplotype jjj_haplotype = haplotypeList.get(jjj);
double haplotypeLikelihood = 0.0;
for( final String sample : samples ) {
final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample);
final int[] readCounts_iii = iii_haplotype.getReadCounts(sample);
final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample);
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
}
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
}
}
// normalize the diploid likelihoods matrix
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
}
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
@ -296,14 +318,7 @@ public class LikelihoodCalculationEngine {
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
// set up the default 1-to-1 haplotype mapping object
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
list.add(h);
haplotypeMapping.add(list);
}
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together
int hap1 = 0;
int hap2 = 0;
@ -347,7 +362,7 @@ public class LikelihoodCalculationEngine {
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call,
final Pair<VariantContext, Map<Allele,List<Haplotype>>> call,
final double downsamplingFraction,
final PrintStream downsamplingLog ) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();

View File

@ -278,9 +278,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype
// for GGA mode, add the desired allele into the haplotype
for( final VariantContext compVC : activeAllelesToGenotype ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart());
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
return returnHaplotypes;
//throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype);
@ -290,15 +291,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
// for GGA mode, add the desired allele into the haplotype if it isn't already present
if( !activeAllelesToGenotype.isEmpty() ) {
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
// This if statement used to additionally have:
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
// a haplotype that already contains a 1bp insertion (so practically it is reference but
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
if( vcOnHaplotype == null ) {
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
}
}
}
@ -369,6 +379,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) );
if ( haplotype.isArtificialHaplotype() )
h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition());
h.leftBreakPoint = leftBreakPoint;
h.rightBreakPoint = rightBreakPoint;
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures

View File

@ -51,16 +51,16 @@ public class BQSRIntegrationTest extends WalkerTest {
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
String HiSeqInterval = "chr1:10,000,000-10,100,000";
return new Object[][]{
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "4fd3c9ad97e6ac58cba644a76564c9f7")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "2620f734cce20f70ce13afd880e46e5c")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "5eb3b94e767da19a4c037ee132e4b19a")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab261d291b107a3da7897759c0e4fa89")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "292303f649fbb19dc05d4a0197a49eeb")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "8ced9d1094493f17fb1876b818a64541")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "abb838131e403d39820dbd66932d1ed0")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "8f62aa0e75770204c98d8299793cc53c")},
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")},
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")},

View File

@ -21,17 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3");
HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f");
HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f");
}
// TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "39da622b309597d7a0b082c8aa1748c9");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"541aa8291f03ba33bd1ad3d731fd5657");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -42,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -53,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6");
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -64,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762");
}
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("788176e1717bd28fc7cbc8e3efbb6100"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec437d2d9f3ae07d155983be0155c8ed"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -91,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("425f1a0fb00d7145edf1c55e54346fae"));
Arrays.asList("40bf739fb2b1743642498efe79ea6342"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
}

View File

@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -102,7 +101,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
haplotypes.add(haplotype);
}
}
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample");
final HashSet<String> sampleSet = new HashSet<String>(1);
sampleSet.add("myTestSample");
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes);
}
}

View File

@ -118,17 +118,24 @@ public class CommandLineGATK extends CommandLineExecutable {
public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded";
private static void checkForMaskedUserErrors(final Throwable t) {
// masked out of memory error
if ( t instanceof OutOfMemoryError )
exitSystemWithUserError(new UserException.NotEnoughMemory());
// masked user error
if ( t instanceof UserException || t instanceof TribbleException )
exitSystemWithUserError(new UserException(t.getMessage()));
// no message means no masked error
final String message = t.getMessage();
if ( message == null )
return;
// we know what to do about the common "Too many open files" error
// too many open files error
if ( message.contains("Too many open files") )
exitSystemWithUserError(new UserException.TooManyOpenFiles());
// malformed BAM looks like a SAM file
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
exitSystemWithSamError(t);
// can't close tribble index when writing
@ -138,12 +145,10 @@ public class CommandLineGATK extends CommandLineExecutable {
// disk is full
if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
// masked out of memory error
if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError )
exitSystemWithUserError(new UserException.NotEnoughMemory());
// masked error wrapped in another one
if ( t.getCause() != null )
checkForMaskedUserErrors(t.getCause());
}
/**

View File

@ -271,7 +271,18 @@ public class GATKReport {
* @return a simplified GATK report
*/
public static GATKReport newSimpleReport(final String tableName, final String... columns) {
GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.length);
return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns);
}
/**
* @see #newSimpleReport(String, String...) but with a customized description
* @param tableName
* @param desc
* @param columns
* @return
*/
public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) {
GATKReportTable table = new GATKReportTable(tableName, desc, columns.length);
for (String column : columns) {
table.addColumn(column, "");

View File

@ -80,6 +80,9 @@ public enum GATKReportVersion {
* @return The version as an enum.
*/
public static GATKReportVersion fromHeader(String header) {
if ( header == null )
throw new UserException.BadInput("The GATK report has no version specified in the header");
if (header.startsWith("##:GATKReport.v0.1 "))
return GATKReportVersion.V0_1;

View File

@ -80,7 +80,6 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
// skip this location -- it's not part of our engine intervals
// TODO -- this is dangerously slow with current overlaps implementation : GSA-649 / GenomeLocSortedSet.overlaps is crazy slow
if ( outsideEngineIntervals(location) )
continue;

View File

@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
final List<Double> refQuals, final List<Double> altQuals) {
if (pileup != null && likelihoodMap == null) {
// no per-read likelihoods available:
// old UG snp-only path through the annotations
for ( final PileupElement p : pileup ) {
if ( isUsableBase(p) ) {
if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) {
@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
}
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
// BUGBUG: There needs to be a comparable isUsableBase check here
if (a.isNoCall())
continue; // read is non-informative
if (a.isReference())
refQuals.add((double)el.getKey().getMappingQuality());
else if (allAlleles.contains(a))
altQuals.add((double)el.getKey().getMappingQuality());
}
}

View File

@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
ReadBackedPileup pileup = null;
if (stratifiedContexts != null) {
if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context != null )
pileup = context.getBasePileup();

View File

@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
final List<Double> refQuals, final List<Double> altQuals) {
if (alleleLikelihoodMap == null) {
// use fast SNP-based version if we don't have per-read allele likelihoods
// use old UG SNP-based version if we don't have per-read allele likelihoods
for ( final PileupElement p : pileup ) {
if ( isUsableBase(p) ) {
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);

View File

@ -82,7 +82,7 @@ import java.util.*;
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@Reference(window=@Window(start=-50,stop=50))
@By(DataSource.REFERENCE)
public class VariantAnnotator extends RodWalker<Integer, Integer> implements AnnotatorCompatible {
public class VariantAnnotator extends RodWalker<Integer, Integer> implements AnnotatorCompatible, TreeReducible<Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ -275,14 +275,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
return true;
}
/**
* Initialize the number of loci processed to zero.
*
* @return 0
*/
public Integer reduceInit() { return 0; }
/**
* We want reads that span deletions
*
@ -323,15 +315,15 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
return 1;
}
/**
* Increment the number of loci processed.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of loci processed.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
@Override
public Integer reduceInit() { return 0; }
@Override
public Integer reduce(Integer value, Integer sum) { return value + sum; }
@Override
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}
/**

View File

@ -277,8 +277,12 @@ public class VariantAnnotatorEngine {
if ( expression.fieldName.equals("ID") ) {
if ( vc.hasID() )
infoAnnotations.put(expression.fullName, vc.getID());
} else if (expression.fieldName.equals("ALT")) {
infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString());
} else if ( vc.hasAttribute(expression.fieldName) ) {
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
}
}
}

View File

@ -227,7 +227,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
*/
public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead);
final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) );
if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it
RecalUtils.parsePlatformForRead(read, RAC);
@ -268,16 +268,25 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
}
protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
final int BUFFER_SIZE = 0;
final int readLength = read.getReadBases().length;
final boolean[] knownSites = new boolean[readLength];
Arrays.fill(knownSites, false);
for( final Feature f : features ) {
int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; }
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
featureStartOnRead = 0;
}
int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; }
Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true);
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
featureEndOnRead = readLength;
}
if( featureStartOnRead > readLength ) {
featureStartOnRead = featureEndOnRead = readLength;
}
Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true);
}
return knownSites;
}

View File

@ -102,13 +102,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
/**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/
@Hidden
@Advanced
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
public boolean RUN_WITHOUT_DBSNP = false;
@ -139,6 +136,13 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
public int INDELS_CONTEXT_SIZE = 3;
/**
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
* This argument is ignored if the Cycle covariate is not used.
*/
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
public int MAXIMUM_CYCLE_VALUE = 500;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
*/
@ -176,9 +180,15 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
public String BINARY_TAG_NAME = null;
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;

View File

@ -6,11 +6,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
@ -20,6 +19,21 @@ import java.util.Map;
*/
public class CoverageUtils {
public enum CountPileupType {
/**
* Count all reads independently (even if from the same fragment).
*/
COUNT_READS,
/**
* Count all fragments (even if the reads that compose the fragment are not consistent at that base).
*/
COUNT_FRAGMENTS,
/**
* Count all fragments (but only if the reads that compose the fragment are consistent at that base).
*/
COUNT_FRAGMENTS_REQUIRE_SAME_BASE
}
/**
* Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context
* as an array of ints, indexed by the index fields of BaseUtils
@ -64,10 +78,10 @@ public class CoverageUtils {
}
public static Map<DoCOutputType.Partition,Map<String,int[]>>
getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, Collection<DoCOutputType.Partition> types) {
getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection<DoCOutputType.Partition> types) {
Map<DoCOutputType.Partition,Map<String,int[]>> countsByIDByType = new HashMap<DoCOutputType.Partition,Map<String,int[]>>();
Map<SAMReadGroupRecord,int[]> countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ);
Map<SAMReadGroupRecord,int[]> countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType);
for (DoCOutputType.Partition t : types ) {
// iterate through the read group counts and build the type associations
for ( Map.Entry<SAMReadGroupRecord,int[]> readGroupCountEntry : countsByRG.entrySet() ) {
@ -95,31 +109,95 @@ public class CoverageUtils {
}
}
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) {
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) {
Map<SAMReadGroupRecord, int[]> countsByRG = new HashMap<SAMReadGroupRecord,int[]>();
for ( PileupElement e : context.getBasePileup() ) {
if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) {
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
if ( ! countsByRG.keySet().contains(readGroup) ) {
countsByRG.put(readGroup,new int[6]);
updateCounts(countsByRG.get(readGroup),e);
} else {
updateCounts(countsByRG.get(readGroup),e);
List<PileupElement> countPileup = new LinkedList<PileupElement>();
FragmentCollection<PileupElement> fpile;
switch (countType) {
case COUNT_READS:
for (PileupElement e : context.getBasePileup())
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
countPileup.add(e);
break;
case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters:
fpile = context.getBasePileup().getStartSortedPileup().toFragments();
for (PileupElement e : fpile.getSingletonReads())
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
countPileup.add(e);
for (List<PileupElement> overlappingPair : fpile.getOverlappingPairs()) {
// iterate over all elements in fragment:
for (PileupElement e : overlappingPair) {
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) {
countPileup.add(e); // add the first passing element per fragment
break;
}
}
}
}
break;
case COUNT_FRAGMENTS_REQUIRE_SAME_BASE:
fpile = context.getBasePileup().getStartSortedPileup().toFragments();
for (PileupElement e : fpile.getSingletonReads())
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
countPileup.add(e);
for (List<PileupElement> overlappingPair : fpile.getOverlappingPairs()) {
PileupElement firstElem = null;
PileupElement addElem = null;
// iterate over all elements in fragment:
for (PileupElement e : overlappingPair) {
if (firstElem == null)
firstElem = e;
else if (e.getBase() != firstElem.getBase()) {
addElem = null;
break;
}
// will add the first passing element per base-consistent fragment:
if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
addElem = e;
}
if (addElem != null)
countPileup.add(addElem);
}
break;
default:
throw new UserException("Must use valid CountPileupType");
}
for (PileupElement e : countPileup) {
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
if (!countsByRG.keySet().contains(readGroup))
countsByRG.put(readGroup, new int[6]);
updateCounts(countsByRG.get(readGroup), e);
}
return countsByRG;
}
private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) {
return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ));
}
private static void updateCounts(int[] counts, PileupElement e) {
if ( e.isDeletion() ) {
counts[BaseUtils.DELETION_INDEX]++;
counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount();
} else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) {
counts[BaseUtils.NO_CALL_INDEX]++;
counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount();
} else {
try {
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++;
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount();
} catch (ArrayIndexOutOfBoundsException exc) {
throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase());
}

View File

@ -129,11 +129,15 @@ public class DepthOfCoverage extends LocusWalker<Map<DoCOutputType.Partition,Map
int minMappingQuality = -1;
@Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false)
int maxMappingQuality = Integer.MAX_VALUE;
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to -1.", required = false)
byte minBaseQuality = -1;
@Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE).", required = false)
byte maxBaseQuality = Byte.MAX_VALUE;
@Argument(fullName = "countType", doc = "How should overlapping reads from the same fragment be handled?", required = false)
CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS;
/**
* Instead of reporting depth, report the base pileup at each locus
*/
@ -373,7 +377,7 @@ public class DepthOfCoverage extends LocusWalker<Map<DoCOutputType.Partition,Map
//System.out.printf("\t[log]\t%s",ref.getLocus());
}
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,countType,partitionTypes);
} else {
return null;
}

View File

@ -47,6 +47,12 @@ import java.util.List;
* <p>
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
*
* The output format can be partially controlled using the provided command-line arguments.
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
* separate fasta sequence (named numerically in order).
*
* Several important notes:
* 1) if there are multiple variants that start at a site, it chooses one of them randomly.
* 2) when there are overlapping indels (but with different start positions) only the first will be chosen.

View File

@ -190,6 +190,10 @@ public class UnifiedGenotyperEngine {
final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap);
if ( vc != null )
results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap));
// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set
// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES)
// results.add(generateEmptyContext(tracker, refContext, null, rawContext));
}
}
}

View File

@ -183,6 +183,10 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
@Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
private boolean keepSitesWithAC0 = false;
@Hidden
@Argument(fullName="numSamples", shortName="numSamples", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
private int numSamplesFromArgument = 0;
/**
* If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
* variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
@ -589,6 +593,14 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; }
public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
public int getNumberOfSamplesForEvaluation() {
if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty())
return sampleNamesForEvaluation.size();
else {
return numSamplesFromArgument;
}
}
public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }
public List<RodBinding<VariantContext>> getComps() { return comps; }

View File

@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier {
// There are ploidy x n sample chromosomes
// TODO -- generalize to handle multiple ploidy
nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy();
nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy();
if ( nchrom < 2 )
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample");

View File

@ -151,14 +151,6 @@ import java.util.*;
* -mvq 50 \
* -o violations.vcf
*
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -number 1000
*
* Creating a set with 50% of the total number of variants in the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \

View File

@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() ));
}
/**
* Tests whether this genome loc starts at the same position as that.
*
* i.e., do this and that have the same contig and the same start position
*
* @param that genome loc to compare to
* @return true if this and that have the same contig and the same start position
*/
@Requires("that != null")
public final boolean startsAt( GenomeLoc that ) {
int comparison = this.compareContigs(that);
return comparison == 0 && this.getStart() == that.getStart();
}
/**
* Tests whether any portion of this contig is before that contig.
* @param that Other contig to test.

View File

@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
// our private storage for the GenomeLoc's
private List<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
// cache this to make overlap checking much more efficient
private int previousOverlapSearchIndex = -1;
/** default constructor */
public GenomeLocSortedSet(GenomeLocParser parser) {
this.genomeLocParser = parser;
@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
* Return the number of bps before loc in the sorted set
*
* @param loc the location before which we are counting bases
* @return
* @return the number of base pairs over all previous intervals
*/
public long sizeBeforeLoc(GenomeLoc loc) {
long s = 0;
@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
if ( e.isBefore(loc) )
s += e.size();
else if ( e.isPast(loc) )
; // don't do anything
break; // we are done
else // loc is inside of s
s += loc.getStart() - e.getStart();
}
@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
* Determine if the given loc overlaps any loc in the sorted set
*
* @param loc the location to test
* @return
* @return trip if the location overlaps any loc
*/
public boolean overlaps(final GenomeLoc loc) {
for(final GenomeLoc e : mArray) {
if(e.overlapsP(loc)) {
return true;
}
// edge condition
if ( mArray.isEmpty() )
return false;
// use the cached version first
if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) )
return true;
// update the cached index
previousOverlapSearchIndex = Collections.binarySearch(mArray, loc);
// if it matches an interval exactly, we are done
if ( previousOverlapSearchIndex >= 0 )
return true;
// check whether it overlaps the interval before or after the insertion point
previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2);
return overlapsAtOrImmediatelyAfterCachedIndex(loc, false);
}
private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) {
// check the cached entry
if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) )
return true;
// check the entry after the cached entry since we may have moved to it
boolean returnValue = false;
if ( previousOverlapSearchIndex < mArray.size() - 1 ) {
returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc);
if ( updateCachedIndex )
previousOverlapSearchIndex++;
}
return false;
return returnValue;
}
/**
@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
mArray.add(e);
return true;
} else {
int loc = Collections.binarySearch(mArray,e);
final int loc = Collections.binarySearch(mArray,e);
if (loc >= 0) {
throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString());
} else {

View File

@ -49,7 +49,9 @@ public class Haplotype {
private int alignmentStartHapwrtRef;
public int leftBreakPoint = 0;
public int rightBreakPoint = 0;
private Allele artificialAllele = null;
private int artificialAllelePosition = -1;
/**
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
*
@ -71,6 +73,12 @@ public class Haplotype {
this(bases, 0);
}
protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) {
this(bases, 0);
this.artificialAllele = artificialAllele;
this.artificialAllelePosition = artificialAllelePosition;
}
public Haplotype( final byte[] bases, final GenomeLoc loc ) {
this(bases);
this.genomeLocation = loc;
@ -171,8 +179,25 @@ public class Haplotype {
this.cigar = cigar;
}
public boolean isArtificialHaplotype() {
return artificialAllele != null;
}
public Allele getArtificialAllele() {
return artificialAllele;
}
public int getArtificialAllelePosition() {
return artificialAllelePosition;
}
public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) {
this.artificialAllele = artificialAllele;
this.artificialAllelePosition = artificialAllelePosition;
}
@Requires({"refInsertLocation >= 0"})
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) {
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
// refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true);
if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype
@ -182,7 +207,7 @@ public class Haplotype {
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant
return new Haplotype(newHaplotypeBases);
return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation);
}
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {

View File

@ -293,6 +293,10 @@ public class Utils {
}
}
public static <T> String join(final String separator, final T ... objects) {
return join(separator, Arrays.asList(objects));
}
public static String dupString(char c, int nCopies) {
char[] chars = new char[nCopies];
Arrays.fill(chars, c);

View File

@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord;
import org.apache.commons.io.FilenameUtils;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
/**
@ -317,4 +322,33 @@ public class VCFUtils {
assembly = "hg19";
return assembly;
}
/**
* Read all of the VCF records from source into memory, returning the header and the VariantContexts
*
* @param source the file to read, must be in VCF4 format
* @return
* @throws IOException
*/
public static Pair<VCFHeader, List<VariantContext>> readVCF(final File source) throws IOException {
// read in the features
final List<VariantContext> vcs = new ArrayList<VariantContext>();
final VCFCodec codec = new VCFCodec();
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
FeatureCodecHeader header = codec.readHeader(pbs);
pbs.close();
pbs = new PositionalBufferedStream(new FileInputStream(source));
pbs.skip(header.getHeaderEnd());
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
while ( ! pbs.isDone() ) {
final VariantContext vc = codec.decode(pbs);
if ( vc != null )
vcs.add(vc);
}
return new Pair<VCFHeader, List<VariantContext>>(vcfHeader, vcs);
}
}

View File

@ -254,19 +254,32 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
* Returns a new ReadBackedPileup where only one read from an overlapping read
* pair is retained. If the two reads in question disagree to their basecall,
* neither read is retained. If they agree on the base, the read with the higher
* quality observation is retained
* base quality observation is retained
*
* @return the newly filtered pileup
*/
@Override
public RBP getOverlappingFragmentFilteredPileup() {
public ReadBackedPileup getOverlappingFragmentFilteredPileup() {
return getOverlappingFragmentFilteredPileup(true, true);
}
/**
* Returns a new ReadBackedPileup where only one read from an overlapping read
* pair is retained. If discardDiscordant and the two reads in question disagree to their basecall,
* neither read is retained. Otherwise, the read with the higher
* quality (base or mapping, depending on baseQualNotMapQual) observation is retained
*
* @return the newly filtered pileup
*/
@Override
public RBP getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual) {
if (pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>) pileupElementTracker;
PerSamplePileupElementTracker<PE> filteredTracker = new PerSamplePileupElementTracker<PE>();
for (final String sample : tracker.getSamples()) {
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup<RBP, PE> pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup();
AbstractReadBackedPileup<RBP, PE> pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual);
filteredTracker.addElements(sample, pileup.pileupElementTracker);
}
return (RBP) createNewPileup(loc, filteredTracker);
@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
// if the reads disagree at this position, throw them both out. Otherwise
// keep the element with the higher quality score
if (existing.getBase() != p.getBase()) {
if (discardDiscordant && existing.getBase() != p.getBase()) {
filteredPileup.remove(readName);
} else {
if (existing.getQual() < p.getQual()) {
filteredPileup.put(readName, p);
if (baseQualNotMapQual) {
if (existing.getQual() < p.getQual())
filteredPileup.put(readName, p);
}
else {
if (existing.getMappingQual() < p.getMappingQual())
filteredPileup.put(readName, p);
}
}
}
@ -998,6 +1016,44 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
return quals2String(getQuals());
}
/**
* Returns a new ReadBackedPileup that is sorted by start coordinate of the reads.
*
* @return
*/
@Override
public RBP getStartSortedPileup() {
final TreeSet<PE> sortedElements = new TreeSet<PE>(new Comparator<PE>() {
@Override
public int compare(PE element1, PE element2) {
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
}
});
if (pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>) pileupElementTracker;
for (final String sample : tracker.getSamples()) {
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
for (PE pile : perSampleElements)
sortedElements.add(pile);
}
}
else {
UnifiedPileupElementTracker<PE> tracker = (UnifiedPileupElementTracker<PE>) pileupElementTracker;
for (PE pile : tracker)
sortedElements.add(pile);
}
UnifiedPileupElementTracker<PE> sortedTracker = new UnifiedPileupElementTracker<PE>();
for (PE pile : sortedElements)
sortedTracker.add(pile);
return (RBP) createNewPileup(loc, sortedTracker);
}
@Override
public FragmentCollection<PileupElement> toFragments() {
return FragmentUtils.create(this);

View File

@ -60,6 +60,16 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
*/
public ReadBackedPileup getOverlappingFragmentFilteredPileup();
/**
* Returns a new ReadBackedPileup where only one read from an overlapping read
* pair is retained. If discardDiscordant and the two reads in question disagree to their basecall,
* neither read is retained. Otherwise, the read with the higher
* quality (base or mapping, depending on baseQualNotMapQual) observation is retained
*
* @return the newly filtered pileup
*/
public ReadBackedPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual);
/**
* Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this
* does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy
@ -261,6 +271,13 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
*/
public byte[] getMappingQuals();
/**
* Returns a new ReadBackedPileup that is sorted by start coordinate of the reads.
*
* @return
*/
public ReadBackedPileup getStartSortedPileup();
/**
* Converts this pileup into a FragmentCollection (see FragmentUtils for documentation)
* @return

View File

@ -49,7 +49,7 @@ import java.util.EnumSet;
public class CycleCovariate implements StandardCovariate {
private static final int MAXIMUM_CYCLE_VALUE = 1000;
private int MAXIMUM_CYCLE_VALUE;
private static final int CUSHION_FOR_INDELS = 4;
private String default_platform = null;
@ -59,6 +59,8 @@ public class CycleCovariate implements StandardCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE;
if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM))
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform.");
@ -88,6 +90,9 @@ public class CycleCovariate implements StandardCovariate {
final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1;
for (int i = 0; i < readLength; i++) {
if ( cycle > MAXIMUM_CYCLE_VALUE )
throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle was detected in read " + read.getReadName() + ". Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)");
final int substitutionKey = keyFromCycle(cycle);
final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey;
values.addCovariate(substitutionKey, indelKey, indelKey, i);

View File

@ -184,9 +184,6 @@ public class VariantContext implements Feature { // to enable tribble integratio
protected CommonInfo commonInfo = null;
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
@Deprecated // ID is no longer stored in the attributes map
private final static String ID_KEY = "ID";
public final static Set<String> PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet<String>());
/** The location of this VariantContext */
@ -287,10 +284,6 @@ public class VariantContext implements Feature { // to enable tribble integratio
this.commonInfo = new CommonInfo(source, log10PError, filters, attributes);
// todo -- remove me when this check is no longer necessary
if ( this.commonInfo.hasAttribute(ID_KEY) )
throw new IllegalArgumentException("Trying to create a VariantContext with a ID key. Please use provided constructor argument ID");
if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); }
// we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles

View File

@ -979,6 +979,40 @@ public class VariantContextUtils {
private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
/**
* Split variant context into its biallelic components if there are more than 2 alleles
*
* For VC has A/B/C alleles, returns A/B and A/C contexts.
* Genotypes are all no-calls now (it's not possible to fix them easily)
* Alleles are right trimmed to satisfy VCF conventions
*
* If vc is biallelic or non-variant it is just returned
*
* Chromosome counts are updated (but they are by definition 0)
*
* @param vc a potentially multi-allelic variant context
* @return a list of bi-allelic (or monomorphic) variant context
*/
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc) {
if ( ! vc.isVariant() || vc.isBiallelic() )
// non variant or biallelics already satisfy the contract
return Collections.singletonList(vc);
else {
final List<VariantContext> biallelics = new LinkedList<VariantContext>();
for ( final Allele alt : vc.getAlternateAlleles() ) {
VariantContextBuilder builder = new VariantContextBuilder(vc);
final List<Allele> alleles = Arrays.asList(vc.getReference(), alt);
builder.alleles(alleles);
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
calculateChromosomeCounts(builder, true);
biallelics.add(reverseTrimAlleles(builder.make()));
}
return biallelics;
}
}
/**
* subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately)
*

View File

@ -0,0 +1,358 @@
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
* Created with IntelliJ IDEA.
* User: thibault
* Date: 11/13/12
* Time: 2:47 PM
*
* Test the Active Region Traversal Contract
* http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract
*/
public class TraverseActiveRegionsTest extends BaseTest {
private class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
private final double prob;
protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new HashMap<GenomeLoc, ActiveRegion>();
public DummyActiveRegionWalker() {
this.prob = 1.0;
}
@Override
public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
isActiveCalls.add(ref.getLocus());
return new ActivityProfileResult(ref.getLocus(), prob);
}
@Override
public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
mappedActiveRegions.put(activeRegion.getLocation(), activeRegion);
return 0;
}
@Override
public Integer reduceInit() {
return 0;
}
@Override
public Integer reduce(Integer value, Integer sum) {
return 0;
}
}
private final TraverseActiveRegions<Integer, Integer> t = new TraverseActiveRegions<Integer, Integer>();
private IndexedFastaSequenceFile reference;
private SAMSequenceDictionary dictionary;
private GenomeLocParser genomeLocParser;
private List<GenomeLoc> intervals;
private List<SAMRecord> reads;
@BeforeClass
private void init() throws FileNotFoundException {
reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference));
dictionary = reference.getSequenceDictionary();
genomeLocParser = new GenomeLocParser(dictionary);
intervals = new ArrayList<GenomeLoc>();
intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20));
intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999));
intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999));
intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999));
intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000));
// TODO: this fails!
//intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 20000));
intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList();
reads = new ArrayList<SAMRecord>();
reads.add(buildSAMRecord("simple", "1", 100, 200));
reads.add(buildSAMRecord("overlap_equal", "1", 10, 20));
reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21));
reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009));
reads.add(buildSAMRecord("boundary_unequal", "1", 1995, 2050));
reads.add(buildSAMRecord("extended_only", "1", 3000, 3100));
reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990));
reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000));
// TODO
//reads.add(buildSAMRecord("simple20", "20", 10100, 10150));
}
@Test
public void testAllBasesSeen() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
List<GenomeLoc> activeIntervals = getIsActiveIntervals(walker, intervals);
// Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call
verifyEqualIntervals(intervals, activeIntervals);
// TODO: more tests and edge cases
}
private List<GenomeLoc> getIsActiveIntervals(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
List<GenomeLoc> activeIntervals = new ArrayList<GenomeLoc>();
for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) {
t.traverse(walker, dataProvider, 0);
activeIntervals.addAll(walker.isActiveCalls);
}
return activeIntervals;
}
@Test
public void testActiveRegionCoverage() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
verifyActiveRegionCoverage(intervals, activeRegions);
// TODO: more tests and edge cases
}
private void verifyActiveRegionCoverage(List<GenomeLoc> intervals, Collection<ActiveRegion> activeRegions) {
List<GenomeLoc> intervalStarts = new ArrayList<GenomeLoc>();
List<GenomeLoc> intervalStops = new ArrayList<GenomeLoc>();
for (GenomeLoc interval : intervals) {
intervalStarts.add(interval.getStartLocation());
intervalStops.add(interval.getStopLocation());
}
Map<GenomeLoc, ActiveRegion> baseRegionMap = new HashMap<GenomeLoc, ActiveRegion>();
for (ActiveRegion activeRegion : activeRegions) {
for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) {
// Contract: Regions do not overlap
Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region");
baseRegionMap.put(activeLoc, activeRegion);
}
GenomeLoc start = activeRegion.getLocation().getStartLocation();
if (intervalStarts.contains(start))
intervalStarts.remove(start);
GenomeLoc stop = activeRegion.getLocation().getStopLocation();
if (intervalStops.contains(stop))
intervalStops.remove(stop);
}
for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) {
// Contract: Each location in the interval(s) is in exactly one region
// Contract: The total set of regions exactly matches the analysis interval(s)
Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region");
baseRegionMap.remove(baseLoc);
}
// Contract: The total set of regions exactly matches the analysis interval(s)
Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals");
// Contract: All explicit interval boundaries must also be region boundaries
Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location");
Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location");
}
@Test
public void testReadMapping() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
// Contract: Each read has the Primary state in a single region (or none)
// This is the region of maximum overlap for the read (earlier if tied)
// Contract: Each read has the Non-Primary state in all other regions it overlaps
// Contract: Each read has the Extended state in regions where it only overlaps if the region is extended
// simple: Primary in 1:1-999
// overlap_equal: Primary in 1:1-999
// overlap_unequal: Primary in 1:1-999
// boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
// boundary_unequal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
// extended_only: Extended in 1:2000-2999
// extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
// outside_intervals: none
// TODO
// simple20: Primary in 20:10000-20000
Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
ActiveRegion region;
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
verifyReadPrimary(region, "simple");
verifyReadPrimary(region, "overlap_equal");
verifyReadPrimary(region, "overlap_unequal");
verifyReadNotPlaced(region, "boundary_equal");
verifyReadNotPlaced(region, "boundary_unequal");
verifyReadNotPlaced(region, "extended_only");
// TODO: fail verifyReadNonPrimary(region, "extended_and_np");
verifyReadNotPlaced(region, "outside_intervals");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
verifyReadNotPlaced(region, "simple");
verifyReadNotPlaced(region, "overlap_equal");
verifyReadNotPlaced(region, "overlap_unequal");
// TODO: fail verifyReadPrimary(region, "boundary_equal");
// TODO: fail verifyReadNonPrimary(region, "boundary_unequal");
verifyReadNotPlaced(region, "extended_only");
// TODO: fail verifyReadPrimary(region, "extended_and_np");
verifyReadNotPlaced(region, "outside_intervals");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
verifyReadNotPlaced(region, "simple");
verifyReadNotPlaced(region, "overlap_equal");
verifyReadNotPlaced(region, "overlap_unequal");
// TODO: fail verifyReadNonPrimary(region, "boundary_equal");
verifyReadPrimary(region, "boundary_unequal");
// TODO: fail verifyReadExtended(region, "extended_only");
// TODO: fail verifyReadExtended(region, "extended_and_np");
verifyReadNotPlaced(region, "outside_intervals");
// TODO: more tests and edge cases
}
private void verifyReadPrimary(ActiveRegion region, String readName) {
SAMRecord read = getRead(region, readName);
Assert.assertFalse(read.getNotPrimaryAlignmentFlag(), "Read " + read + " not primary in active region " + region);
}
private void verifyReadNonPrimary(ActiveRegion region, String readName) {
SAMRecord read = getRead(region, readName);
Assert.assertTrue(read.getNotPrimaryAlignmentFlag(), "Read " + read + " primary in active region " + region);
}
private void verifyReadExtended(ActiveRegion region, String readName) {
Assert.fail("The Extended read state has not been implemented");
}
private void verifyReadNotPlaced(ActiveRegion region, String readName) {
for (SAMRecord read : region.getReads()) {
if (read.getReadName().equals(readName))
Assert.fail("Read " + readName + " found in active region " + region);
}
}
private SAMRecord getRead(ActiveRegion region, String readName) {
for (SAMRecord read : region.getReads()) {
if (read.getReadName().equals(readName))
return read;
}
Assert.fail("Read " + readName + " not found in active region " + region);
return null;
}
private Map<GenomeLoc, ActiveRegion> getActiveRegions(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
for (LocusShardDataProvider dataProvider : createDataProviders(intervals))
t.traverse(walker, dataProvider, 0);
return walker.mappedActiveRegions;
}
private Collection<GenomeLoc> toSingleBaseLocs(GenomeLoc interval) {
List<GenomeLoc> bases = new ArrayList<GenomeLoc>();
if (interval.size() == 1)
bases.add(interval);
else {
for (int location = interval.getStart(); location <= interval.getStop(); location++)
bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location));
}
return bases;
}
private Collection<GenomeLoc> toSingleBaseLocs(List<GenomeLoc> intervals) {
Set<GenomeLoc> bases = new TreeSet<GenomeLoc>(); // for sorting and uniqueness
for (GenomeLoc interval : intervals)
bases.addAll(toSingleBaseLocs(interval));
return bases;
}
private void verifyEqualIntervals(List<GenomeLoc> aIntervals, List<GenomeLoc> bIntervals) {
Collection<GenomeLoc> aBases = toSingleBaseLocs(aIntervals);
Collection<GenomeLoc> bBases = toSingleBaseLocs(bIntervals);
Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size());
Iterator<GenomeLoc> aIter = aBases.iterator();
Iterator<GenomeLoc> bIter = bBases.iterator();
while (aIter.hasNext() && bIter.hasNext()) {
GenomeLoc aLoc = aIter.next();
GenomeLoc bLoc = bIter.next();
Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc);
}
}
// copied from LocusViewTemplate
protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) {
SAMFileHeader header = new SAMFileHeader();
header.setSequenceDictionary(dictionary);
GATKSAMRecord record = new GATKSAMRecord(header);
record.setReadName(readName);
record.setReferenceIndex(dictionary.getSequenceIndex(contig));
record.setAlignmentStart(alignmentStart);
Cigar cigar = new Cigar();
int len = alignmentEnd - alignmentStart + 1;
cigar.add(new CigarElement(len, CigarOperator.M));
record.setCigar(cigar);
record.setReadBases(new byte[len]);
record.setBaseQualities(new byte[len]);
return record;
}
private List<LocusShardDataProvider> createDataProviders(List<GenomeLoc> intervals) {
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
engine.setGenomeLocParser(genomeLocParser);
t.initialize(engine);
StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(reads);
Shard shard = new MockLocusShard(genomeLocParser, intervals);
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, iterator, shard.getGenomeLocs())) {
providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
}
return providers;
}
}

View File

@ -151,7 +151,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
}
@Test
public void testTabixAnnotations() {
public void testTabixAnnotationsAndParallelism() {
final String MD5 = "99938d1e197b8f10c408cac490a00a62";
for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) {
WalkerTestSpec spec = new WalkerTestSpec(
@ -159,6 +159,12 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
Arrays.asList(MD5));
executeTest("Testing lookup vcf tabix vs. vcf tribble", spec);
}
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1,
Arrays.asList(MD5));
executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec);
}
@Test

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.BeforeClass;
@ -117,6 +118,41 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
assertTrue(loc.getContigIndex() == 1);
}
@Test
public void overlap() {
for ( int i = 1; i < 6; i++ ) {
final int start = i * 10;
mSortedSet.add(genomeLocParser.createGenomeLoc(contigOneName, start, start + 1));
}
// test matches in and around interval
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 9, 9)));
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 10, 10)));
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 11)));
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 12)));
// test matches spanning intervals
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 14, 20)));
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 15)));
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 30, 40)));
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53)));
// test miss
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 19)));
// test exact match after miss
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 40, 41)));
// test matches at beginning of intervals
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 5, 6)));
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 0, 10)));
// test matches at end of intervals
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53)));
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53)));
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53)));
}
@Test
public void mergingOverlappingAbove() {
GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50);

View File

@ -159,7 +159,7 @@ public class HaplotypeUnitTest extends BaseTest {
final VariantContext vc = new VariantContextBuilder().alleles(alleles).loc("1", loc, loc + h1refAllele.getBases().length - 1).make();
h.setAlignmentStartHapwrtRef(0);
h.setCigar(cigar);
final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc);
final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc, vc.getStart());
final Haplotype h1expected = new Haplotype(newHap.getBytes());
Assert.assertEquals(h1, h1expected);
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -24,7 +25,7 @@ public class CycleCovariateUnitTest {
covariate.initialize(RAC);
}
@Test(enabled = false)
@Test(enabled = true)
public void testSimpleCycles() {
short readLength = 10;
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
@ -53,9 +54,31 @@ public class CycleCovariateUnitTest {
for (short i = 0; i < values.length; i++) {
short actual = Short.decode(covariate.formatKey(values[i][0]));
int expected = init + (increment * i);
// System.out.println(String.format("%d: %d, %d", i, actual, expected));
Assert.assertEquals(actual, expected);
}
}
@Test(enabled = true, expectedExceptions={UserException.class})
public void testMoreThanMaxCycleFails() {
int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1;
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
read.setReadPairedFlag(true);
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
read.getReadGroup().setPlatform("illumina");
ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1);
covariate.recordValues(read, readCovariates);
}
@Test(enabled = true)
public void testMaxCyclePasses() {
int readLength = RAC.MAXIMUM_CYCLE_VALUE;
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
read.setReadPairedFlag(true);
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
read.getReadGroup().setPlatform("illumina");
ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1);
covariate.recordValues(read, readCovariates);
}
}

View File

@ -782,7 +782,7 @@ public class VariantContextTestProvider {
Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
Assert.assertEquals(actual.getID(), expected.getID(), "id");
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles");
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);
assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");

View File

@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
@ -39,7 +39,7 @@ import java.io.FileNotFoundException;
import java.util.*;
public class VariantContextUtilsUnitTest extends BaseTest {
Allele Aref, T, C, Cref, ATC, ATCATC;
Allele Aref, T, C, G, Cref, ATC, ATCATC;
private GenomeLocParser genomeLocParser;
@BeforeSuite
@ -58,6 +58,7 @@ public class VariantContextUtilsUnitTest extends BaseTest {
Cref = Allele.create("C", true);
T = Allele.create("T");
C = Allele.create("C");
G = Allele.create("G");
ATC = Allele.create("ATC");
ATCATC = Allele.create("ATCATC");
}
@ -697,10 +698,120 @@ public class VariantContextUtilsUnitTest extends BaseTest {
return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class);
}
@Test(dataProvider = "ReverseClippingPositionTestProvider")
public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) {
int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false);
Assert.assertEquals(result, cfg.expectedClip);
}
}
// --------------------------------------------------------------------------------
//
// test splitting into bi-allelics
//
// --------------------------------------------------------------------------------
@DataProvider(name = "SplitBiallelics")
public Object[][] makeSplitBiallelics() throws CloneNotSupportedException {
List<Object[]> tests = new ArrayList<Object[]>();
final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C));
// biallelic -> biallelic
tests.add(new Object[]{root.make(), Arrays.asList(root.make())});
// monos -> monos
root.alleles(Arrays.asList(Aref));
tests.add(new Object[]{root.make(), Arrays.asList(root.make())});
root.alleles(Arrays.asList(Aref, C, T));
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Aref, C)).make(),
root.alleles(Arrays.asList(Aref, T)).make())});
root.alleles(Arrays.asList(Aref, C, T, G));
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Aref, C)).make(),
root.alleles(Arrays.asList(Aref, T)).make(),
root.alleles(Arrays.asList(Aref, G)).make())});
final Allele C = Allele.create("C");
final Allele CA = Allele.create("CA");
final Allele CAA = Allele.create("CAA");
final Allele CAAAA = Allele.create("CAAAA");
final Allele CAAAAA = Allele.create("CAAAAA");
final Allele Cref = Allele.create("C", true);
final Allele CAref = Allele.create("CA", true);
final Allele CAAref = Allele.create("CAA", true);
final Allele CAAAref = Allele.create("CAAA", true);
root.alleles(Arrays.asList(Cref, CA, CAA));
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Cref, CA)).make(),
root.alleles(Arrays.asList(Cref, CAA)).make())});
root.alleles(Arrays.asList(CAAref, C, CA)).stop(12);
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(CAAref, C)).make(),
root.alleles(Arrays.asList(CAref, C)).stop(11).make())});
root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13);
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(CAAAref, C)).make(),
root.alleles(Arrays.asList(CAAref, C)).stop(12).make(),
root.alleles(Arrays.asList(CAref, C)).stop(11).make())});
root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13);
tests.add(new Object[]{root.make(),
Arrays.asList(
root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(),
root.alleles(Arrays.asList(Cref, CA)).stop(10).make(),
root.alleles(Arrays.asList(CAref, C)).stop(11).make(),
root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "SplitBiallelics")
public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
final List<VariantContext> biallelics = VariantContextUtils.splitVariantContextToBiallelics(vc);
Assert.assertEquals(biallelics.size(), expectedBiallelics.size());
for ( int i = 0; i < biallelics.size(); i++ ) {
final VariantContext actual = biallelics.get(i);
final VariantContext expected = expectedBiallelics.get(i);
VariantContextTestProvider.assertEquals(actual, expected);
}
}
@Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes")
public void testSplitBiallelicsGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
final List<Genotype> genotypes = new ArrayList<Genotype>();
int sampleI = 0;
for ( final List<Allele> alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) {
genotypes.add(GenotypeBuilder.create("sample" + sampleI, alleles));
}
genotypes.add(GenotypeBuilder.createMissing("missing", 2));
final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make();
final List<VariantContext> biallelics = VariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes);
for ( int i = 0; i < biallelics.size(); i++ ) {
final VariantContext actual = biallelics.get(i);
Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples
for ( final Genotype inputGenotype : genotypes ) {
final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName());
Assert.assertNotNull(actualGenotype);
if ( ! vc.isVariant() || vc.isBiallelic() )
Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName()));
else
Assert.assertTrue(actualGenotype.isNoCall());
}
}
}
}

View File

@ -0,0 +1,507 @@
package org.broadinstitute.sting.queue.qscripts.CNV
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.util.VCF_BAM_utilities
import org.broadinstitute.sting.queue.util.DoC._
import org.broadinstitute.sting.commandline.Hidden
import java.io.{PrintStream, PrintWriter}
import org.broadinstitute.sting.utils.text.XReadLines
import collection.JavaConversions._
import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils
class xhmmCNVpipeline extends QScript {
qscript =>
@Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true)
var bams: File = _
@Input(doc = "gatk jar file", shortName = "J", required = true)
var gatkJarFile: File = _
@Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true)
var xhmmExec: File = _
@Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true)
var pseqExec: File = _
@Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true)
var pseqSeqDB: String = _
@Input(shortName = "R", doc = "ref", required = true)
var referenceFile: File = _
@Input(shortName = "L", doc = "Intervals", required = false)
var intervals: File = _
@Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false)
var scatterCountInput = 0
@Argument(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false)
var samplesPerJob = 1
@Output(doc = "Base name for files to output", shortName = "o", required = true)
var outputBase: File = _
@Hidden
@Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false)
var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS
@Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false)
var MAX_DEPTH = 20000
@Hidden
@Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false)
var NUM_BINS = 200
@Hidden
@Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false)
var START_BIN = 1
@Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false)
var minMappingQuality = 0
@Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false)
var minBaseQuality = 0
@Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false)
var wholeMatrixMemory = -1
@Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false)
var minTargGC : Double = 0.1
@Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false)
var maxTargGC : Double = 0.9
@Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false)
var minTargRepeats : Double = 0.0
@Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false)
var maxTargRepeats : Double = 0.1
@Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false)
var sampleIDsMap: String = ""
@Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false)
var sampleIDsMapFromColumn = 1
@Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false)
var sampleIDsMapToColumn = 2
@Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false)
var targetSampleFiltersString: String = ""
@Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false)
var PCAnormalizeMethodString: String = ""
@Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false)
var targetSampleNormalizedFiltersString: String = ""
@Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true)
var xhmmParamsArg: File = _
@Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false)
var discoverCommandLineParams: String = ""
@Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false)
var genotypeCommandLineParams: String = ""
@Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false)
var genotypeSubsegments: Boolean = false
@Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false)
var maxTargetsInSubsegment = 30
@Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false)
var subsegmentGenotypeThreshold = 20.0
@Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false)
var longJobQueue: String = ""
val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list"
val RD_OUTPUT_SUFFIX: String = ".RD.txt"
val TARGS_GC_SUFFIX = ".locus_GC.txt"
val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt"
val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt"
val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt"
val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt"
val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt"
trait WholeMatrixMemoryLimit extends CommandLineFunction {
// Since loading ALL of the data can take significant memory:
if (wholeMatrixMemory < 0) {
this.memoryLimit = 24
}
else {
this.memoryLimit = wholeMatrixMemory
}
}
trait LongRunTime extends CommandLineFunction {
if (longJobQueue != "")
this.jobQueue = longJobQueue
}
def script = {
val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile)
add(prepTargets)
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals :+= prepTargets.out
this.jarFile = qscript.gatkJarFile
this.reference_sequence = qscript.referenceFile
this.logging_level = "INFO"
}
val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams))
val samples: List[String] = sampleToBams.keys.toList
Console.out.printf("Samples are %s%n", samples)
val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase)
var docs: List[DoC] = List[DoC]()
for (group <- groups) {
Console.out.printf("Group is %s%n", group)
docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs
}
addAll(docs)
val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit
add(mergeDepths)
var excludeTargets : List[File] = List[File]()
if (minTargGC > 0 || maxTargGC < 1) {
val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs
calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX
add(calcGCcontents)
val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC)
add(excludeTargetsBasedOnGC)
excludeTargets ::= excludeTargetsBasedOnGC.out
}
class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction {
@Input(doc="")
var intervals: File = prepTargets.out
@Output(doc="")
var out : File = new File(outFile)
val regFile : String = outputBase.getPath + ".targets.reg"
val locDB : String = outputBase.getPath + ".targets.LOCDB"
val removeFiles = "rm -f " + regFile + " " + locDB
val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile
val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load"
val calcRepeatMaskedPercent = pseqExec + " . loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats"
val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out
var command: String =
removeFiles +
" && " + createRegFile +
" && " + createLOCDB +
" && " + calcRepeatMaskedPercent +
" && " + extractRepeatMaskedPercent
def commandLine = command
override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command
}
if (minTargRepeats > 0 || maxTargRepeats < 1) {
val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX)
add(calcRepeatComplexity)
val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats)
add(excludeTargetsBasedOnRepeats)
excludeTargets ::= excludeTargetsBasedOnRepeats.out
}
val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets)
add(filterCenterDepths)
val pca = new PCA(filterCenterDepths.filteredCentered)
add(pca)
val normalize = new Normalize(pca)
add(normalize)
val filterZscore = new FilterAndZscoreNormalized(normalize.normalized)
add(filterZscore)
val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore)
add(filterOriginal)
val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered)
add(discover)
val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
add(genotype)
if (genotypeSubsegments) {
val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
add(genotypeSegs)
}
}
class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction {
@Input(doc="")
var locus_value : File = locus_valueIn
@Output(doc="")
var out : File = new File(outputBase.getPath + outSuffix)
def run = {
var outWriter = new PrintWriter(new PrintStream(out))
var elems = asScalaIterator(new XReadLines(locus_value))
while (elems.hasNext) {
val line = elems.next
val splitLine = line.split("\\s+")
val locus = splitLine(0)
val locValStr = splitLine(1)
try {
val locVal = locValStr.toDouble
if (locVal < minVal || locVal > maxVal)
outWriter.printf("%s%n", locus)
}
catch {
case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus)
case e: Exception => throw e
}
}
outWriter.close
}
}
class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit {
@Input(doc = "")
val input = inputParam
@Input(doc = "")
val excludeTargets = excludeTargetsIn
@Output
val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX)
@Output
val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX)
@Output
val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX)
var command: String =
xhmmExec + " --matrix" +
" -r " + input +
" --centerData --centerType target" +
" -o " + filteredCentered +
" --outputExcludedTargets " + filteredTargets +
" --outputExcludedSamples " + filteredSamples
command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _)
if (targetSampleFiltersString != "")
command += " " + targetSampleFiltersString
def commandLine = command
override def description = "Filters samples and targets and then mean-centers the targets: " + command
}
class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
@Input(doc = "")
val input = inputParam
val PCAbase: String = outputBase.getPath + ".RD_PCA"
@Output
val outPC: File = new File(PCAbase + ".PC.txt")
@Output
val outPC_SD: File = new File(PCAbase + ".PC_SD.txt")
@Output
val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt")
var command: String =
xhmmExec + " --PCA" +
" -r " + input +
" --PCAfiles " + PCAbase
def commandLine = command
override def description = "Runs PCA on mean-centered data: " + command
}
class Normalize(pca: PCA) extends CommandLineFunction {
@Input(doc = "")
val input = pca.input
@Input(doc = "")
val inPC = pca.outPC
@Input(doc = "")
val inPC_SD = pca.outPC_SD
@Input(doc = "")
val inPC_LOADINGS = pca.outPC_LOADINGS
@Output
val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt")
var command: String =
xhmmExec + " --normalize" +
" -r " + input +
" --PCAfiles " + pca.PCAbase +
" --normalizeOutput " + normalized
if (PCAnormalizeMethodString != "")
command += " " + PCAnormalizeMethodString
def commandLine = command
override def description = "Normalizes mean-centered data using PCA information: " + command
}
class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
@Input(doc = "")
val input = inputParam
@Output
val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX)
@Output
val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX)
@Output
val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX)
var command: String =
xhmmExec + " --matrix" +
" -r " + input +
" --centerData --centerType sample --zScoreData" +
" -o " + filteredZscored +
" --outputExcludedTargets " + filteredTargets +
" --outputExcludedSamples " + filteredSamples
if (targetSampleNormalizedFiltersString != "")
command += " " + targetSampleNormalizedFiltersString
def commandLine = command
override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command
}
class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit {
@Input(doc = "")
val input = inputParam
@Input(doc = "")
val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u))
@Input(doc = "")
val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u))
@Output
val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX)
var command: String =
xhmmExec + " --matrix" +
" -r " + input +
targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) +
sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) +
" -o " + sameFiltered
def commandLine = command
override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command
}
class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
@Input(doc = "")
val input = inputParam
@Input(doc = "")
val xhmmParams = xhmmParamsArg
@Input(doc = "")
val origRD = origRDParam
@Output
val xcnv: File = new File(outputBase.getPath + ".xcnv")
@Output
val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv")
val posteriorsBase = outputBase.getPath
@Output
val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt")
@Output
val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt")
@Output
val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt")
var command: String =
xhmmExec + " --discover" +
" -p " + xhmmParams +
" -r " + input +
" -R " + origRD +
" -c " + xcnv +
" -a " + aux_xcnv +
" -s " + posteriorsBase +
" " + discoverCommandLineParams
def commandLine = command
override def description = "Discovers CNVs in normalized data: " + command
}
abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
@Input(doc = "")
val input = inputParam
@Input(doc = "")
val xhmmParams = xhmmParamsArg
@Input(doc = "")
val origRD = origRDParam
@Input(doc = "")
val inXcnv = xcnv
var command: String =
xhmmExec + " --genotype" +
" -p " + xhmmParams +
" -r " + input +
" -g " + inXcnv +
" -F " + referenceFile +
" -R " + origRD +
" " + genotypeCommandLineParams
}
class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
@Output
val vcf: File = new File(outputBase.getPath + ".vcf")
command +=
" -v " + vcf
def commandLine = command
override def description = "Genotypes discovered CNVs in all samples: " + command
}
class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
@Output
val vcf: File = new File(outputBase.getPath + ".subsegments.vcf")
command +=
" -v " + vcf +
" --subsegments" +
" --maxTargetsInSubsegment " + maxTargetsInSubsegment +
" --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold
def commandLine = command
override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command
}
}

View File

@ -147,13 +147,13 @@ class GATKResourcesBundle extends QScript {
//
// example call set for wiki tutorial
//
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf",
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf",
"NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true))
//
// Test BAM file, specific to each reference
//
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam",
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam",
"IGNORE", b37, false, false))
//

View File

@ -0,0 +1,131 @@
package org.broadinstitute.sting.queue.util
import java.io.File
import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK}
import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction
import org.broadinstitute.sting.gatk.downsampling.DownsampleType
import org.broadinstitute.sting.commandline.{Input, Gather, Output}
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils
package object DoC {
class DoC(val bams: List[File], val DoC_output: File, val countType: CoverageUtils.CountPileupType, val MAX_DEPTH: Int, val minMappingQuality: Int, val minBaseQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction {
val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary"
// So that the output files of this DoC run get deleted once they're used further downstream:
this.isIntermediate = true
this.analysis_type = "DepthOfCoverage"
this.input_file = bams
this.downsample_to_coverage = Some(MAX_DEPTH)
this.downsampling_type = DownsampleType.BY_SAMPLE
this.scatterCount = scatterCountInput
this.scatterClass = classOf[IntervalScatterFunction]
// HACK for DoC to work properly within Queue:
@Output
@Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
var intervalSampleOut: File = new File(DoC_output.getPath + DOC_OUTPUT_SUFFIX)
override def commandLine = super.commandLine +
" --omitDepthOutputAtEachBase" +
" --omitLocusTable" +
" --minMappingQuality " + minMappingQuality +
" --minBaseQuality " + minBaseQuality +
optional("--countType", countType, spaceSeparated=true, escape=true, format="%s") +
" --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS +
(if (!minCoverageCalcs.isEmpty) minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).reduceLeft(_ + "" + _) else "") +
" --includeRefNSites" +
" -o " + DoC_output
override def shortDescription = "DoC: " + DoC_output
}
class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality: Int, minBaseQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) {
// HACK for DoC to work properly within Queue:
@Output
@Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
var outPrefix = DoC_output
override def commandLine = super.commandLine.replaceAll(" --omitDepthOutputAtEachBase", "")
}
def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = {
var l: List[Group] = Nil
var remaining = samples
var subsamples: List[String] = Nil
var count = 1
while (!remaining.isEmpty) {
val splitRes = (remaining splitAt samplesPerJob)
subsamples = splitRes._1
remaining = splitRes._2
l ::= new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams))
count = count + 1
}
return l
}
// A group has a list of samples and bam files to use for DoC
class Group(val name: String, val outputBase: File, val samples: List[String], val bams: List[File]) {
// getName() just includes the file name WITHOUT the path:
val groupOutputName = name + "." + outputBase.getName
// Comment this out, so that when jobs are scattered in DoC class below, they do not scatter into outputs at directories that don't exist!!! :
//def DoC_output = new File(outputBase.getParentFile(), groupOutputName)
def DoC_output = new File(groupOutputName)
override def toString(): String = String.format("[Group %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams)
}
class MergeGATKdepths(DoCsToCombine: List[File], outFile: String, columnSuffix: String, xhmmExec: File, sampleIDsMap: String, sampleIDsMapFromColumn: Int, sampleIDsMapToColumn: Int, rdPrecisionArg: Option[Int], outputTargetsBySamples: Boolean) extends CommandLineFunction {
@Input(doc = "")
var inputDoCfiles: List[File] = DoCsToCombine
@Output
val mergedDoC: File = new File(outFile)
var command: String =
xhmmExec + " --mergeGATKdepths" +
inputDoCfiles.map(input => " --GATKdepths " + input).reduceLeft(_ + "" + _) +
" --columnSuffix " + columnSuffix +
" -o " + mergedDoC
if (sampleIDsMap != "")
command += " --sampleIDmap " + sampleIDsMap + " --fromID " + sampleIDsMapFromColumn + " --toID " + sampleIDsMapToColumn
rdPrecisionArg match {
case Some(rdPrecision) => {
command += " --rdPrecision " + rdPrecision
}
case None => {}
}
if (outputTargetsBySamples)
command += " --outputTargetsBySamples"
def commandLine = command
override def description = "Combines DoC outputs for multiple samples (at same loci): " + command
}
class PrepareTargets(intervalsIn: List[File], outIntervals: String, val xhmmExec: File, val referenceFile: File) extends CommandLineFunction {
@Input(doc = "List of files containing targeted intervals to be prepared and merged")
var inIntervals: List[File] = intervalsIn
@Output(doc = "The merged intervals file to write to")
var out: File = new File(outIntervals)
var command: String =
xhmmExec + " --prepareTargets" +
" -F " + referenceFile +
inIntervals.map(intervFile => " --targets " + intervFile).reduceLeft(_ + "" + _) +
" --mergedTargets " + out
def commandLine = command
override def description = "Sort all target intervals, merge overlapping ones, and print the resulting interval list: " + command
}
}

View File

@ -26,36 +26,31 @@ object VCF_BAM_utilities {
case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list")
}
def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match {
case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
case x :: y =>
val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBAMsForSample(y)
val bamSamples: List[String] = getSamplesInBAM(x)
def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = {
var m = scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
for (bam <- bams) {
val bamSamples: List[String] = getSamplesInBAM(bam)
for (s <- bamSamples) {
if (!m.contains(s))
m += s -> scala.collection.mutable.Set.empty[File]
m(s) = m(s) + x
m(s) += bam
}
}
return m
}
def findBAMsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = {
var s = scala.collection.mutable.Set.empty[File]
def findBAMsForSamplesHelper(samples: List[String]): scala.collection.mutable.Set[File] = samples match {
case Nil => scala.collection.mutable.Set.empty[File]
case x :: y =>
var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File]
if (sampleToBams.contains(x))
bamsForSampleX = sampleToBams(x)
return bamsForSampleX ++ findBAMsForSamplesHelper(y)
for (sample <- samples) {
if (sampleToBams.contains(sample))
s ++= sampleToBams(sample)
}
val l: List[File] = Nil
return l ++ findBAMsForSamplesHelper(samples)
return l ++ s
}
}