Merge branch 'develop' of https://github.com/broadinstitute/cmi-gatk into develop
This commit is contained in:
commit
3db877f4f2
24
build.xml
24
build.xml
|
|
@ -899,7 +899,7 @@
|
|||
|
||||
|
||||
<!-- Build a package consisting of all supporting files. Don't call this target directly. Call one of the specific packaging targets below -->
|
||||
<target name="package" depends="dist,stage,require.executable" description="bundle up an executable for distribution">
|
||||
<target name="package" depends="require.clean,dist,stage,require.executable" description="bundle up an executable for distribution">
|
||||
<mkdir dir="${package.output.dir}" />
|
||||
<xslt destdir="${package.output.dir}" style="${package.xml.dir}/CreatePackager.xsl" useImplicitFileset="false">
|
||||
<flattenmapper/>
|
||||
|
|
@ -1023,6 +1023,24 @@
|
|||
<delete dir="${pipelinetest.dir}"/>
|
||||
</target>
|
||||
|
||||
<!-- Depend on this target if your target requires a clean working directory but you don't want to depend on clean directly -->
|
||||
<target name="require.clean">
|
||||
<condition property="not.clean">
|
||||
<or>
|
||||
<available file="${build.dir}" />
|
||||
<available file="${lib.dir}" />
|
||||
<available file="${contract.dump.dir}" />
|
||||
<available file="${staging.dir}" />
|
||||
<available file="${dist.dir}" />
|
||||
<available file="${pipelinetest.dir}" />
|
||||
<available file="${javadoc.dir}" />
|
||||
<available file="${scaladoc.dir}" />
|
||||
<available file="${gatkdocs.dir}" />
|
||||
</or>
|
||||
</condition>
|
||||
<fail message="Selected build target requires a clean working directory. Run ant clean and then try again." if="not.clean" />
|
||||
</target>
|
||||
|
||||
|
||||
<!-- ******************************************************************************** -->
|
||||
<!-- gsalib -->
|
||||
|
|
@ -1186,14 +1204,16 @@
|
|||
<echo message="" />
|
||||
<echo message="Sting: Running @{testtype} test cases!"/>
|
||||
|
||||
<!-- no test is allowed to run for more than 10 hours -->
|
||||
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
|
||||
<testng outputDir="@{outputdir}"
|
||||
classpathref="${testng.classpath}"
|
||||
haltOnFailure="false" failureProperty="test.failure"
|
||||
verbose="2"
|
||||
timeout="36000000"
|
||||
workingDir="${basedir}"
|
||||
useDefaultListeners="false"
|
||||
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
|
||||
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
|
||||
<jvmarg value="-Xmx${test.maxmemory}" />
|
||||
<jvmarg value="-ea" />
|
||||
<jvmarg value="-Djava.awt.headless=true" />
|
||||
|
|
|
|||
|
|
@ -67,7 +67,6 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
||||
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
|
||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
||||
|
|
@ -78,12 +77,21 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
}
|
||||
});
|
||||
|
||||
// make a listing of allele counts
|
||||
final int[] alleleCounts = new int[4];
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||
|
||||
// do smart down-sampling
|
||||
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
||||
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
if ( alleleList.size() <= numReadsToRemove )
|
||||
logAllElements(alleleList, log);
|
||||
// if we don't need to remove any reads, keep them all
|
||||
if ( alleleList.size() <= targetAlleleCounts[i] )
|
||||
elementsToKeep.addAll(alleleList);
|
||||
else
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log));
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
|
|
@ -93,6 +101,66 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
||||
}
|
||||
|
||||
private static int scoreAlleleCounts(final int[] alleleCounts) {
|
||||
if ( alleleCounts.length < 2 )
|
||||
return 0;
|
||||
|
||||
// sort the counts (in ascending order)
|
||||
final int[] alleleCountsCopy = alleleCounts.clone();
|
||||
Arrays.sort(alleleCountsCopy);
|
||||
|
||||
final int maxCount = alleleCountsCopy[alleleCounts.length - 1];
|
||||
final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2];
|
||||
|
||||
int remainderCount = 0;
|
||||
for ( int i = 0; i < alleleCounts.length - 2; i++ )
|
||||
remainderCount += alleleCountsCopy[i];
|
||||
|
||||
// try to get the best score:
|
||||
// - in the het case the counts should be equal with nothing else
|
||||
// - in the hom case the non-max should be zero
|
||||
return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes an allele biased version of the given pileup
|
||||
*
|
||||
* @param alleleCounts the original pileup
|
||||
* @param numReadsToRemove fraction of total reads to remove per allele
|
||||
* @return allele biased pileup
|
||||
*/
|
||||
protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
|
||||
final int numAlleles = alleleCounts.length;
|
||||
|
||||
int maxScore = scoreAlleleCounts(alleleCounts);
|
||||
int[] alleleCountsOfMax = alleleCounts;
|
||||
|
||||
final int numReadsToRemovePerAllele = numReadsToRemove / 2;
|
||||
|
||||
for ( int i = 0; i < numAlleles; i++ ) {
|
||||
for ( int j = i; j < numAlleles; j++ ) {
|
||||
final int[] newCounts = alleleCounts.clone();
|
||||
|
||||
// split these cases so we don't lose on the floor (since we divided by 2)
|
||||
if ( i == j ) {
|
||||
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove);
|
||||
} else {
|
||||
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele);
|
||||
newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele);
|
||||
}
|
||||
|
||||
final int score = scoreAlleleCounts(newCounts);
|
||||
|
||||
if ( score < maxScore ) {
|
||||
maxScore = score;
|
||||
alleleCountsOfMax = newCounts;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return alleleCountsOfMax;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
|
||||
*
|
||||
|
|
@ -102,7 +170,15 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
* @return the list of pileup elements TO KEEP
|
||||
*/
|
||||
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
|
||||
if ( numElementsToRemove == 0 )
|
||||
return elements;
|
||||
|
||||
final int pileupSize = elements.size();
|
||||
if ( numElementsToRemove == pileupSize ) {
|
||||
logAllElements(elements, log);
|
||||
return new ArrayList<PileupElement>(0);
|
||||
}
|
||||
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
|
|
@ -132,15 +208,25 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
|
||||
totalReads += reads.size();
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
int numReadsToRemove = (int)(totalReads * downsamplingFraction);
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove * alleleReadMap.size());
|
||||
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() ) {
|
||||
if ( reads.size() <= numReadsToRemove ) {
|
||||
readsToRemove.addAll(reads);
|
||||
logAllReads(reads, log);
|
||||
} else {
|
||||
readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log));
|
||||
|
||||
// make a listing of allele counts
|
||||
final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
|
||||
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
|
||||
final int numAlleles = alleles.size();
|
||||
final int[] alleleCounts = new int[numAlleles];
|
||||
for ( int i = 0; i < numAlleles; i++ )
|
||||
alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
|
||||
|
||||
// do smart down-sampling
|
||||
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
|
||||
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
|
||||
for ( int i = 0; i < numAlleles; i++ ) {
|
||||
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
|
||||
|
||||
if ( alleleBin.size() > targetAlleleCounts[i] ) {
|
||||
readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -156,13 +242,22 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
* @return the list of pileup elements TO REMOVE
|
||||
*/
|
||||
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
|
||||
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
|
||||
|
||||
if ( numElementsToRemove == 0 )
|
||||
return readsToRemove;
|
||||
|
||||
final int pileupSize = reads.size();
|
||||
if ( numElementsToRemove == pileupSize ) {
|
||||
logAllReads(reads, log);
|
||||
return reads;
|
||||
}
|
||||
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( itemsToRemove.get(i) ) {
|
||||
final GATKSAMRecord read = reads.get(i);
|
||||
|
|
|
|||
|
|
@ -253,7 +253,6 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
intervalList.addAll(toolkit.getIntervals());
|
||||
|
||||
|
||||
// todo -- rework the whole NO_PG_TAG thing
|
||||
final boolean preSorted = true;
|
||||
final boolean indexOnTheFly = true;
|
||||
final boolean keep_records = true;
|
||||
|
|
|
|||
|
|
@ -220,7 +220,6 @@ public class SlidingWindow {
|
|||
regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose);
|
||||
}
|
||||
|
||||
// todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions
|
||||
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
|
||||
readsInWindow.pollFirst();
|
||||
}
|
||||
|
|
@ -607,9 +606,7 @@ public class SlidingWindow {
|
|||
toRemove.add(read);
|
||||
}
|
||||
}
|
||||
for (GATKSAMRecord read : toRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
removeReadsFromWindow(toRemove);
|
||||
}
|
||||
return allReads;
|
||||
}
|
||||
|
|
@ -805,9 +802,8 @@ public class SlidingWindow {
|
|||
hetReads.add(finalizeRunningConsensus());
|
||||
}
|
||||
|
||||
for (GATKSAMRecord read : toRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
removeReadsFromWindow(toRemove);
|
||||
|
||||
return hetReads;
|
||||
}
|
||||
|
||||
|
|
@ -924,5 +920,11 @@ public class SlidingWindow {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void removeReadsFromWindow (List<GATKSAMRecord> readsToRemove) {
|
||||
for (GATKSAMRecord read : readsToRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,6 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -42,163 +41,26 @@ import java.util.*;
|
|||
public class GenotypingEngine {
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
|
||||
public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
|
||||
public GenotypingEngine( final boolean DEBUG ) {
|
||||
this.DEBUG = DEBUG;
|
||||
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
noCall.add(Allele.NO_CALL);
|
||||
}
|
||||
|
||||
// WARN
|
||||
// This function is the streamlined approach, currently not being used by default
|
||||
// WARN
|
||||
// WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
|
||||
// WARN
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser ) {
|
||||
// Prepare the list of haplotype indices to genotype
|
||||
final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();
|
||||
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) );
|
||||
}
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make());
|
||||
}
|
||||
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here
|
||||
|
||||
// Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF
|
||||
haplotypesToRemove.add(h);
|
||||
}
|
||||
}
|
||||
haplotypes.removeAll(haplotypesToRemove);
|
||||
|
||||
if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
|
||||
final List<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>> returnVCs = new ArrayList<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>>();
|
||||
// set up the default 1-to-1 haplotype mapping object
|
||||
final HashMap<Allele,ArrayList<Haplotype>> haplotypeMapping = new HashMap<Allele,ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.put(call.getAllele(h.getBases()), list);
|
||||
}
|
||||
returnVCs.add( new Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>(call,haplotypeMapping) );
|
||||
return returnVCs;
|
||||
}
|
||||
|
||||
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
int count = 0;
|
||||
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
}
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
|
||||
startPosKeySet.addAll(h.getEventMap().keySet());
|
||||
}
|
||||
|
||||
// Create the VC merge priority list
|
||||
final ArrayList<String> priorityList = new ArrayList<String>();
|
||||
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
|
||||
priorityList.add("HC" + iii);
|
||||
}
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
}
|
||||
}
|
||||
|
||||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
}
|
||||
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final int myNumHaplotypes = alleleMapper.size();
|
||||
final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < myNumHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
|
||||
// using the allele mapping object translate the haplotype allele into the event allele
|
||||
final Genotype g = new GenotypeBuilder(sample)
|
||||
.alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes))
|
||||
.phased(loc != startPosKeySet.first())
|
||||
.PL(genotypeLikelihoods).make();
|
||||
myGenotypes.add(g);
|
||||
}
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(
|
||||
new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) );
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
}
|
||||
|
||||
// BUGBUG: Create a class to hold this complicated return type
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final ArrayList<VariantContext> activeAllelesToGenotype ) {
|
||||
public List<Pair<VariantContext, Map<Allele, List<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final List<Haplotype> haplotypes,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
|
||||
final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();
|
||||
final ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, Map<Allele, List<Haplotype>>>>();
|
||||
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
|
|
@ -207,7 +69,7 @@ public class GenotypingEngine {
|
|||
for( final Haplotype h : haplotypes ) {
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
|
||||
if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
|
|
@ -217,10 +79,10 @@ public class GenotypingEngine {
|
|||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
|
||||
}
|
||||
if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
|
||||
if( in_GGA_mode ) {
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
startPosKeySet.add( compVC.getStart() );
|
||||
}
|
||||
|
|
@ -232,7 +94,7 @@ public class GenotypingEngine {
|
|||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
|
||||
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
if( !in_GGA_mode ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
|
|
@ -261,7 +123,14 @@ public class GenotypingEngine {
|
|||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||
|
||||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
|
||||
final ArrayList<Allele> alleleOrdering = new ArrayList<Allele>(alleleMapper.size());
|
||||
alleleOrdering.add(refAllele);
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
alleleOrdering.add(vc.getAlternateAllele(0));
|
||||
}
|
||||
|
||||
// Sanity check the priority list
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
|
|
@ -283,11 +152,15 @@ public class GenotypingEngine {
|
|||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
// let's update the Allele keys in the mapper because they can change after merging when there are complex events
|
||||
Map<Allele, List<Haplotype>> updatedAlleleMapper = new HashMap<Allele, List<Haplotype>>(alleleMapper.size());
|
||||
for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) {
|
||||
final Allele oldAllele = alleleOrdering.get(i);
|
||||
final Allele newAllele = mergedVC.getAlleles().get(i);
|
||||
updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele));
|
||||
alleleOrdering.set(i, newAllele);
|
||||
}
|
||||
alleleMapper = updatedAlleleMapper;
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
|
|
@ -299,7 +172,7 @@ public class GenotypingEngine {
|
|||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
|
||||
final int numHaplotypes = alleleMapper.size();
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
|
|
@ -313,23 +186,23 @@ public class GenotypingEngine {
|
|||
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
|
||||
// also, need to update the allele -> haplotype mapping
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
final HashMap<Allele, List<Haplotype>> alleleHashMapTrim = new HashMap<Allele, List<Haplotype>>();
|
||||
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
|
||||
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
|
||||
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii)));
|
||||
}
|
||||
|
||||
call = vcCallTrim;
|
||||
alleleHashMap = alleleHashMapTrim;
|
||||
alleleMapper = alleleHashMapTrim;
|
||||
}
|
||||
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
|
||||
returnCalls.add( new Pair<VariantContext, Map<Allele,List<Haplotype>>>(call, alleleMapper) );
|
||||
}
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
}
|
||||
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes ) {
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().values() ) {
|
||||
|
|
@ -348,7 +221,7 @@ public class GenotypingEngine {
|
|||
haplotypes.removeAll(haplotypesToRemove);
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
final int MAX_SIZE_TO_COMBINE = 15;
|
||||
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
|
||||
if( startPosKeySet.size() <= 1 ) { return; }
|
||||
|
|
@ -395,7 +268,9 @@ public class GenotypingEngine {
|
|||
final ArrayList<Haplotype> haplotypeList = new ArrayList<Haplotype>();
|
||||
haplotypeList.add(h);
|
||||
for( final String sample : haplotypes.get(0).getSampleKeySet() ) {
|
||||
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0];
|
||||
final HashSet<String> sampleSet = new HashSet<String>(1);
|
||||
sampleSet.add(sample);
|
||||
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0];
|
||||
if( thisHapVC == null ) {
|
||||
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
|
||||
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }
|
||||
|
|
@ -489,37 +364,87 @@ public class GenotypingEngine {
|
|||
|
||||
@Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
|
||||
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
|
||||
protected static ArrayList<ArrayList<Haplotype>> createAlleleMapper( final int loc, final ArrayList<VariantContext> eventsAtThisLoc, final ArrayList<Haplotype> haplotypes ) {
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
|
||||
final ArrayList<Haplotype> refList = new ArrayList<Haplotype>();
|
||||
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) {
|
||||
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = new HashMap<Allele, List<Haplotype>>(eventsAtThisLoc.size()+1);
|
||||
final Allele refAllele = eventsAtThisLoc.get(0).getReference();
|
||||
alleleMapper.put(refAllele, new ArrayList<Haplotype>());
|
||||
for( final VariantContext vc : eventsAtThisLoc )
|
||||
alleleMapper.put(vc.getAlternateAllele(0), new ArrayList<Haplotype>());
|
||||
|
||||
final ArrayList<Haplotype> undeterminedHaplotypes = new ArrayList<Haplotype>(haplotypes.size());
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
|
||||
refList.add(h);
|
||||
alleleMapper.get(refAllele).add(h);
|
||||
} else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) {
|
||||
alleleMapper.get(h.getArtificialAllele()).add(h);
|
||||
} else {
|
||||
boolean foundInEventList = false;
|
||||
boolean haplotypeIsDetermined = false;
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
foundInEventList = true;
|
||||
alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h);
|
||||
haplotypeIsDetermined = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype
|
||||
refList.add(h);
|
||||
}
|
||||
|
||||
if( !haplotypeIsDetermined )
|
||||
undeterminedHaplotypes.add(h);
|
||||
}
|
||||
}
|
||||
alleleMapper.add(refList);
|
||||
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
|
||||
list.add(h);
|
||||
|
||||
for( final Haplotype h : undeterminedHaplotypes ) {
|
||||
Allele matchingAllele = null;
|
||||
for( final Map.Entry<Allele, List<Haplotype>> alleleToTest : alleleMapper.entrySet() ) {
|
||||
// don't test against the reference allele
|
||||
if( alleleToTest.getKey().equals(refAllele) )
|
||||
continue;
|
||||
|
||||
final Haplotype artificialHaplotype = alleleToTest.getValue().get(0);
|
||||
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
|
||||
matchingAllele = alleleToTest.getKey();
|
||||
break;
|
||||
}
|
||||
}
|
||||
alleleMapper.add(list);
|
||||
|
||||
if( matchingAllele == null )
|
||||
matchingAllele = refAllele;
|
||||
alleleMapper.get(matchingAllele).add(h);
|
||||
}
|
||||
|
||||
return alleleMapper;
|
||||
}
|
||||
|
||||
protected static boolean isSubSetOf(final Map<Integer, VariantContext> subset, final Map<Integer, VariantContext> superset, final boolean resolveSupersetToSubset) {
|
||||
|
||||
for ( final Map.Entry<Integer, VariantContext> fromSubset : subset.entrySet() ) {
|
||||
final VariantContext fromSuperset = superset.get(fromSubset.getKey());
|
||||
if ( fromSuperset == null )
|
||||
return false;
|
||||
|
||||
List<Allele> supersetAlleles = fromSuperset.getAlternateAlleles();
|
||||
if ( resolveSupersetToSubset )
|
||||
supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
|
||||
|
||||
if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static List<Allele> resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List<Allele> currentAlleles) {
|
||||
if ( targetReference.length() <= actualReference.length() )
|
||||
return currentAlleles;
|
||||
|
||||
final List<Allele> newAlleles = new ArrayList<Allele>(currentAlleles.size());
|
||||
final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
|
||||
for ( final Allele a : currentAlleles ) {
|
||||
newAlleles.add(Allele.extend(a, extraBases));
|
||||
}
|
||||
return newAlleles;
|
||||
}
|
||||
|
||||
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
|
||||
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final ArrayList<ArrayList<Haplotype>> alleleMapper, final ArrayList<Haplotype> haplotypes ) {
|
||||
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -41,7 +40,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
|
|
@ -129,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
|
||||
protected int MIN_PRUNE_FACTOR = 1;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
|
||||
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
|
||||
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
|
||||
protected int gcpHMM = 10;
|
||||
|
|
@ -212,7 +206,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private IndexedFastaSequenceFile referenceReader;
|
||||
private CachingIndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 900;
|
||||
|
|
@ -246,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
|
||||
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.CONTAMINATION_FRACTION = 0.0;
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
|
|
@ -271,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
VCFConstants.GENOTYPE_QUALITY_KEY,
|
||||
VCFConstants.DEPTH_KEY,
|
||||
VCFConstants.GENOTYPE_PL_KEY);
|
||||
// header lines for the experimental HaplotypeCaller-specific annotations
|
||||
headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
|
||||
headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));
|
||||
|
||||
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
|
||||
// where the filters are used. For example, in emitting all sites the lowQual field is used
|
||||
|
|
@ -296,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -324,15 +310,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
}
|
||||
}
|
||||
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
|
||||
return new ActivityProfileResult(1.0);
|
||||
return new ActivityProfileResult(ref.getLocus(), 1.0);
|
||||
}
|
||||
}
|
||||
|
||||
if( USE_ALLELES_TRIGGER ) {
|
||||
return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
|
||||
return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
|
||||
}
|
||||
|
||||
if( context == null ) { return new ActivityProfileResult(0.0); }
|
||||
if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); }
|
||||
|
||||
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
|
@ -369,7 +355,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
|
||||
|
||||
return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
|
||||
return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -419,52 +405,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
|
||||
|
||||
for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
|
||||
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() )
|
||||
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
|
||||
for( final Pair<VariantContext, Map<Allele, List<Haplotype>>> callResult :
|
||||
genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
|
||||
if( !GENOTYPE_FULL_ACTIVE_REGION ) {
|
||||
// add some custom annotations to the calls
|
||||
|
||||
// Calculate the number of variants on the haplotype
|
||||
int maxNumVar = 0;
|
||||
for( final Allele allele : callResult.getFirst().getAlleles() ) {
|
||||
if( !allele.isReference() ) {
|
||||
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
|
||||
final int numVar = haplotype.getEventMap().size();
|
||||
if( numVar > maxNumVar ) { maxNumVar = numVar; }
|
||||
}
|
||||
}
|
||||
}
|
||||
// Calculate the event length
|
||||
int maxLength = 0;
|
||||
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
|
||||
final int length = a.length() - annotatedCall.getReference().length();
|
||||
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
|
||||
}
|
||||
|
||||
myAttributes.put("NVH", maxNumVar);
|
||||
myAttributes.put("NumHapEval", bestHaplotypes.size());
|
||||
myAttributes.put("NumHapAssembly", haplotypes.size());
|
||||
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
|
||||
myAttributes.put("EVENTLENGTH", maxLength);
|
||||
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
|
||||
myAttributes.put("extType", annotatedCall.getType().toString() );
|
||||
|
||||
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
|
||||
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
|
||||
//}
|
||||
if( annotatedCall.hasAttribute("QD") ) {
|
||||
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
|
||||
}
|
||||
}
|
||||
|
||||
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
|
||||
}
|
||||
|
||||
|
|
@ -520,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
// protect against INTERVALS with abnormally high coverage
|
||||
// BUGBUG: remove when positinal downsampler is hooked up to ART/HC
|
||||
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
|
||||
activeRegion.add(clippedRead);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -148,45 +148,31 @@ public class LikelihoodCalculationEngine {
|
|||
return Math.min(b1.length, b2.length);
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final String sample ) {
|
||||
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping );
|
||||
}
|
||||
|
||||
// This function takes just a single sample and a haplotypeMapping
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
|
||||
final TreeSet<String> sampleSet = new TreeSet<String>();
|
||||
sampleSet.add(sample);
|
||||
return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping);
|
||||
return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering);
|
||||
}
|
||||
|
||||
// This function takes a set of samples to pool over and a haplotypeMapping
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final Map<Allele, List<Haplotype>> haplotypeMapping, final List<Allele> alleleOrdering ) {
|
||||
|
||||
final int numHaplotypes = haplotypeMapping.size();
|
||||
final int numHaplotypes = alleleOrdering.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) {
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) {
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
|
||||
|
|
@ -200,12 +186,48 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
// This function takes a set of samples to pool over and a haplotypeMapping
|
||||
@Requires({"haplotypeList.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, final List<Haplotype> haplotypeList ) {
|
||||
|
||||
final int numHaplotypes = haplotypeList.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
// todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
final Haplotype iii_haplotype = haplotypeList.get(iii);
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
final Haplotype jjj_haplotype = haplotypeList.get(jjj);
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample);
|
||||
final int[] readCounts_iii = iii_haplotype.getReadCounts(sample);
|
||||
final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample);
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
|
||||
|
|
@ -296,14 +318,7 @@ public class LikelihoodCalculationEngine {
|
|||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
|
||||
// set up the default 1-to-1 haplotype mapping object
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
|
|
@ -347,7 +362,7 @@ public class LikelihoodCalculationEngine {
|
|||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call,
|
||||
final Pair<VariantContext, Map<Allele,List<Haplotype>>> call,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
|
|
|
|||
|
|
@ -278,9 +278,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
|
||||
final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();
|
||||
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype
|
||||
// for GGA mode, add the desired allele into the haplotype
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart());
|
||||
final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart());
|
||||
if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
|
||||
return returnHaplotypes;
|
||||
//throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype);
|
||||
|
|
@ -290,15 +291,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
|
||||
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
|
||||
|
||||
final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
|
||||
if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
|
||||
if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
if( !activeAllelesToGenotype.isEmpty() ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
|
||||
if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
|
||||
|
||||
// This if statement used to additionally have:
|
||||
// "|| !vcOnHaplotype.hasSameAllelesAs(compVC)"
|
||||
// but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto
|
||||
// a haplotype that already contains a 1bp insertion (so practically it is reference but
|
||||
// falls into the bin for the 1bp deletion because we keep track of the artificial alleles).
|
||||
if( vcOnHaplotype == null ) {
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
|
||||
addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -369,6 +379,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
|
||||
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
|
||||
h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) );
|
||||
if ( haplotype.isArtificialHaplotype() )
|
||||
h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition());
|
||||
h.leftBreakPoint = leftBreakPoint;
|
||||
h.rightBreakPoint = rightBreakPoint;
|
||||
if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures
|
||||
|
|
|
|||
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
||||
/**
|
||||
* Basic unit test for AlleleBiasedDownsamplingUtils
|
||||
*/
|
||||
public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testSmartDownsampling() {
|
||||
|
||||
final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50};
|
||||
final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0};
|
||||
|
||||
// no contamination, no removal
|
||||
testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
|
||||
// hom sample, het contaminant, different alleles
|
||||
testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
|
||||
// hom sample, hom contaminant, different alleles
|
||||
testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
|
||||
|
||||
// het sample, het contaminant, different alleles
|
||||
testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
|
||||
// het sample, hom contaminant, different alleles
|
||||
testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
|
||||
// hom sample, het contaminant, overlapping alleles
|
||||
final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0};
|
||||
testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
|
||||
testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
|
||||
testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
|
||||
|
||||
// hom sample, hom contaminant, overlapping alleles
|
||||
testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0});
|
||||
|
||||
// het sample, het contaminant, overlapping alleles
|
||||
testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55});
|
||||
testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
|
||||
// het sample, hom contaminant, overlapping alleles
|
||||
testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
|
||||
}
|
||||
|
||||
private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction,
|
||||
final int pileupSize, final int[] initialCounts, final int[] targetCounts) {
|
||||
|
||||
final int[] actualCounts = initialCounts.clone();
|
||||
actualCounts[0] += addA;
|
||||
actualCounts[1] += addC;
|
||||
actualCounts[2] += addG;
|
||||
actualCounts[3] += addT;
|
||||
|
||||
final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction));
|
||||
Assert.assertTrue(countsAreEqual(results, targetCounts));
|
||||
}
|
||||
|
||||
private static boolean countsAreEqual(final int[] counts1, final int[] counts2) {
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
if ( counts1[i] != counts2[i] )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -51,16 +51,16 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
|
||||
String HiSeqInterval = "chr1:10,000,000-10,100,000";
|
||||
return new Object[][]{
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "4fd3c9ad97e6ac58cba644a76564c9f7")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "2620f734cce20f70ce13afd880e46e5c")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "5eb3b94e767da19a4c037ee132e4b19a")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab261d291b107a3da7897759c0e4fa89")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "292303f649fbb19dc05d4a0197a49eeb")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "8ced9d1094493f17fb1876b818a64541")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "abb838131e403d39820dbd66932d1ed0")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "8f62aa0e75770204c98d8299793cc53c")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")},
|
||||
|
|
@ -90,6 +90,21 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
executeTest("testBQSRFailWithoutDBSNP", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBQSRCSV() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
" -T BaseRecalibrator" +
|
||||
" -R " + b36KGReference +
|
||||
" -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" +
|
||||
" -knownSites " + b36dbSNP129 +
|
||||
" -L 1:10,000,000-10,200,000" +
|
||||
" -o /dev/null" +
|
||||
" --plot_pdf_file /dev/null" +
|
||||
" --intermediate_csv_file %s",
|
||||
Arrays.asList("d1c38a3418979400630e2bca1140689c"));
|
||||
executeTest("testBQSR-CSVfile", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBQSRFailWithSolidNoCall() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("cdec335abc9ad8e59335e39a73e0e95a"));
|
||||
Arrays.asList("847605f4efafef89529fe0e496315edd"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testWithAllelesPassedIn1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("efddb5e258f97fd4f6661cff9eaa57de"));
|
||||
Arrays.asList("5b31b811072a4df04524e13604015f9b"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
|
||||
}
|
||||
|
||||
|
|
@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testWithAllelesPassedIn2() {
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("24532eb381724cd74e99370da28d49ed"));
|
||||
Arrays.asList("d9992e55381afb43742cc9b30fcd7538"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("062a946160eec1d0fc135d58ca654ff4"));
|
||||
Arrays.asList("fea530fdc8677e10be4cc11625fa5376"));
|
||||
executeTest("test SingleSample Pilot2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("94dc17d76d841f1d3a36160767ffa034"));
|
||||
Arrays.asList("704888987baacff8c7b273b8ab9938d0"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReverseTrim() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||
Arrays.asList("9106d01ca0d0a8fedd068e72d509f380"));
|
||||
Arrays.asList("e14c9b1f9f34d6c16de445bfa385be89"));
|
||||
executeTest("test reverse trim", spec);
|
||||
}
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("d847acf841ba8ba653f996ce4869f439"));
|
||||
Arrays.asList("fb204e821a24d03bd3a671b6e01c449a"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
|
||||
|
|
@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "6792419c482e767a3deb28913ed2b1ad";
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "5b8f477c287770b5769b05591e35bc2d";
|
||||
|
||||
@Test
|
||||
public void testCompressedOutput() {
|
||||
|
|
@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinBaseQualityScore() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
|
||||
Arrays.asList("56157d930da6ccd224bce1ca93f11e41"));
|
||||
Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff"));
|
||||
executeTest("test min_base_quality_score 26", spec);
|
||||
}
|
||||
|
||||
|
|
@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSLOD() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("6ccb9bd88934e4272d0ce362dd35e603"));
|
||||
Arrays.asList("55760482335497086458b09e415ecf54"));
|
||||
executeTest("test SLOD", spec);
|
||||
}
|
||||
|
||||
|
|
@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testNDA() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("480437dd6e2760f4ab3194431519f331"));
|
||||
Arrays.asList("938e888a40182878be4c3cc4859adb69"));
|
||||
executeTest("test NDA", spec);
|
||||
}
|
||||
|
||||
|
|
@ -173,7 +173,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testCompTrack() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("22c039412fd387dde6125b07c9a74a25"));
|
||||
Arrays.asList("7dc186d420487e4e156a24ec8dea0951"));
|
||||
executeTest("test using comp track", spec);
|
||||
}
|
||||
|
||||
|
|
@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testOutputParameterSitesOnly() {
|
||||
testOutputParameters("-sites_only", "40aeb4c9e31fe7046b72afc58e7599cb");
|
||||
testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllConfident() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "c706ca93b25ff83613cb4e95dcac567c");
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllSites() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "8a263fd0a94463ce1de9990f2b8ec841");
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "81fff490c0f59890f1e75dc290833434");
|
||||
}
|
||||
|
||||
private void testOutputParameters(final String args, final String md5) {
|
||||
|
|
@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
|
||||
Arrays.asList("df524e98903d96ab9353bee7c16a69de"));
|
||||
Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb"));
|
||||
executeTest("test confidence 1", spec1);
|
||||
}
|
||||
|
||||
|
|
@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testHeterozyosity1() {
|
||||
testHeterozosity( 0.01, "8e61498ca03a8d805372a64c466b3b42" );
|
||||
testHeterozosity( 0.01, "8dd37249e0a80afa86594c3f1e720760" );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeterozyosity2() {
|
||||
testHeterozosity( 1.0 / 1850, "668d06b5173cf3b97d052726988e1d7b" );
|
||||
testHeterozosity( 1.0 / 1850, "040d169e20fda56f8de009a6015eb384" );
|
||||
}
|
||||
|
||||
private void testHeterozosity(final double arg, final String md5) {
|
||||
|
|
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("908eb5e21fa39e7fb377cf4a9c4c7835"));
|
||||
Arrays.asList("0e4713e4aa44f4f8fcfea7138295a627"));
|
||||
|
||||
executeTest(String.format("test multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,000,000-10,100,000" +
|
||||
" -baq CALCULATE_AS_NECESSARY",
|
||||
1,
|
||||
Arrays.asList("c814558bb0ed2e19b12e1a2bf4465d52"));
|
||||
Arrays.asList("46ea5d1ceb8eed1d0db63c3577915d6c"));
|
||||
|
||||
executeTest(String.format("test calling with BAQ"), spec);
|
||||
}
|
||||
|
|
@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("3593495aab5f6204c65de0b073a6ff65"));
|
||||
Arrays.asList("50329e15e5139be9e3b643f0b3ba8a53"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX"), spec);
|
||||
}
|
||||
|
|
@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -minIndelCnt 1" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("8b486a098029d5a106b0a37eff541c15"));
|
||||
Arrays.asList("2b85e3bd6bf981afaf7324666740d74b"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
|
||||
}
|
||||
|
|
@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("18efedc50cae2aacaba372265e38310b"));
|
||||
Arrays.asList("a6fd46eff78827060451a62cffd698a7"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("3ff8c7c80a518aa3eb8671a21479de5f"));
|
||||
Arrays.asList("b8129bf754490cc3c76191d8cc4ec93f"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
|
||||
}
|
||||
|
||||
|
|
@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("578c0540f4f2052a634a829bcb9cc27d"));
|
||||
Arrays.asList("591332fa0b5b22778cf820ee257049d2"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
|
||||
}
|
||||
|
||||
|
|
@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSampleIndels1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("f7d0d0aee603df25c1f0525bb8df189e"));
|
||||
Arrays.asList("a4761d7f25e7a62f34494801c98a0da7"));
|
||||
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("fc91d457a16b4ca994959c2b5f3f0352"));
|
||||
Arrays.asList("c526c234947482d1cd2ffc5102083a08"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction0() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
|
||||
Arrays.asList("857b8e5df444463ac27f665c4f67fbe2"));
|
||||
Arrays.asList("90adefd39ed67865b0cb275ad0f07383"));
|
||||
executeTest("test minIndelFraction 0.0", spec);
|
||||
}
|
||||
|
||||
|
|
@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction25() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
|
||||
Arrays.asList("81d4c7d9010fd6733b2997bc378e7471"));
|
||||
Arrays.asList("2fded43949e258f8e9f68893c61c1bdd"));
|
||||
executeTest("test minIndelFraction 0.25", spec);
|
||||
}
|
||||
|
||||
|
|
@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testNsInCigar() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1,
|
||||
Arrays.asList("bd7984a374f0ae5d277bd5fc5065f64f"));
|
||||
Arrays.asList("d6d40bacd540a41f305420dfea35e04a"));
|
||||
executeTest("test calling on reads with Ns in CIGAR", spec);
|
||||
}
|
||||
|
||||
|
|
@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("9a7cd58b9e3d5b72608c0d529321deba"));
|
||||
Arrays.asList("c1077662411164182c5f75478344f83d"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "e7fc11baf208a1bca7b462d3148c936e");
|
||||
testReducedCalling("SNP", "f5ccbc96d0d66832dd9b3c5cb6507db4");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "132a4e0ccf9230b5bb4b56c649e2bdd5");
|
||||
testReducedCalling("INDEL", "3c02ee5187933bed44dc416a2e28511f");
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testContaminationDownsampling() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1,
|
||||
Arrays.asList("27dd04159e06d9524fb8a4eef41f96ae"));
|
||||
Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb"));
|
||||
executeTest("test contamination_percentage_to_filter 0.20", spec);
|
||||
}
|
||||
|
||||
|
|
@ -21,17 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "aa1df35d6e64d7ca93feb4d2dd15dd0e");
|
||||
HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "186c7f322978283c01249c6de2829215");
|
||||
HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f");
|
||||
}
|
||||
|
||||
// TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "de9e78a52207fe62144dba5337965469");
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"541aa8291f03ba33bd1ad3d731fd5657");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -42,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "000dbb1b48f94d017cfec127c6cabe8f");
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
@ -53,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleSymbolic() {
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a");
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
|
|
@ -64,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "b369c2a6cb5c99a424551b33bae16f3b");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939"));
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec437d2d9f3ae07d155983be0155c8ed"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6c67ee8e99cc8f53a6587bb26028047"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
|
|
@ -91,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("4beb9f87ab3f316a9384c3d0dca6ebe9"));
|
||||
Arrays.asList("40bf739fb2b1743642498efe79ea6342"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
|
@ -102,7 +101,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
|
|||
haplotypes.add(haplotype);
|
||||
}
|
||||
}
|
||||
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample");
|
||||
final HashSet<String> sampleSet = new HashSet<String>(1);
|
||||
sampleSet.add("myTestSample");
|
||||
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -118,17 +118,24 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded";
|
||||
|
||||
private static void checkForMaskedUserErrors(final Throwable t) {
|
||||
// masked out of memory error
|
||||
if ( t instanceof OutOfMemoryError )
|
||||
exitSystemWithUserError(new UserException.NotEnoughMemory());
|
||||
// masked user error
|
||||
if ( t instanceof UserException || t instanceof TribbleException )
|
||||
exitSystemWithUserError(new UserException(t.getMessage()));
|
||||
|
||||
// no message means no masked error
|
||||
final String message = t.getMessage();
|
||||
if ( message == null )
|
||||
return;
|
||||
|
||||
// we know what to do about the common "Too many open files" error
|
||||
// too many open files error
|
||||
if ( message.contains("Too many open files") )
|
||||
exitSystemWithUserError(new UserException.TooManyOpenFiles());
|
||||
|
||||
// malformed BAM looks like a SAM file
|
||||
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
|
||||
message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
|
||||
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
|
||||
exitSystemWithSamError(t);
|
||||
|
||||
// can't close tribble index when writing
|
||||
|
|
@ -138,12 +145,10 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
// disk is full
|
||||
if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
|
||||
// masked out of memory error
|
||||
if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError )
|
||||
exitSystemWithUserError(new UserException.NotEnoughMemory());
|
||||
// masked error wrapped in another one
|
||||
if ( t.getCause() != null )
|
||||
checkForMaskedUserErrors(t.getCause());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.contexts;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -39,10 +38,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
||||
public class ReferenceContext {
|
||||
final public static boolean UPPERCASE_REFERENCE = true;
|
||||
|
||||
/**
|
||||
* Facilitates creation of new GenomeLocs.
|
||||
*/
|
||||
|
|
@ -59,7 +55,8 @@ public class ReferenceContext {
|
|||
final private GenomeLoc window;
|
||||
|
||||
/**
|
||||
* The bases in the window around the current locus. If null, then bases haven't been fetched yet
|
||||
* The bases in the window around the current locus. If null, then bases haven't been fetched yet.
|
||||
* Bases are always upper cased
|
||||
*/
|
||||
private byte[] basesCache = null;
|
||||
|
||||
|
|
@ -81,7 +78,7 @@ public class ReferenceContext {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
@Ensures({"result != null"})
|
||||
public byte[] getBases();
|
||||
}
|
||||
|
||||
|
|
@ -146,7 +143,9 @@ public class ReferenceContext {
|
|||
private void fetchBasesFromProvider() {
|
||||
if ( basesCache == null ) {
|
||||
basesCache = basesProvider.getBases();
|
||||
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache);
|
||||
|
||||
// must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time
|
||||
assert BaseUtils.isUpperCase(basesCache);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -194,6 +193,7 @@ public class ReferenceContext {
|
|||
/**
|
||||
* All the bases in the window from the current base forward to the end of the window.
|
||||
*/
|
||||
@Ensures({"result != null", "result.length > 0"})
|
||||
public byte[] getForwardBases() {
|
||||
final byte[] bases = getBases();
|
||||
final int mid = locus.getStart() - window.getStart();
|
||||
|
|
|
|||
|
|
@ -198,8 +198,6 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
|
|||
|
||||
source.close();
|
||||
file.delete(); // this should be last to aid in debugging when the process fails
|
||||
} catch (UserException e) {
|
||||
throw new ReviewedStingException("BUG: intermediate file " + file + " is malformed, got error while reading", e);
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -271,7 +271,18 @@ public class GATKReport {
|
|||
* @return a simplified GATK report
|
||||
*/
|
||||
public static GATKReport newSimpleReport(final String tableName, final String... columns) {
|
||||
GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.length);
|
||||
return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #newSimpleReport(String, String...) but with a customized description
|
||||
* @param tableName
|
||||
* @param desc
|
||||
* @param columns
|
||||
* @return
|
||||
*/
|
||||
public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) {
|
||||
GATKReportTable table = new GATKReportTable(tableName, desc, columns.length);
|
||||
|
||||
for (String column : columns) {
|
||||
table.addColumn(column, "");
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
package org.broadinstitute.sting.gatk.report;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
public enum GATKReportVersion {
|
||||
/**
|
||||
|
|
@ -79,6 +80,9 @@ public enum GATKReportVersion {
|
|||
* @return The version as an enum.
|
||||
*/
|
||||
public static GATKReportVersion fromHeader(String header) {
|
||||
if ( header == null )
|
||||
throw new UserException.BadInput("The GATK report has no version specified in the header");
|
||||
|
||||
if (header.startsWith("##:GATKReport.v0.1 "))
|
||||
return GATKReportVersion.V0_1;
|
||||
|
||||
|
|
@ -91,6 +95,6 @@ public enum GATKReportVersion {
|
|||
if (header.startsWith("#:GATKReport.v1.1"))
|
||||
return GATKReportVersion.V1_1;
|
||||
|
||||
throw new ReviewedStingException("Unknown GATK report version in header: " + header);
|
||||
throw new UserException.BadInput("The GATK report has an unknown/unsupported version in the header: " + header);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
|||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
|
|
@ -46,99 +45,126 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
T sum) {
|
||||
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
|
||||
final LocusView locusView = new AllLocusView(dataProvider);
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
int minStart = Integer.MAX_VALUE;
|
||||
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
|
||||
int minStart = Integer.MAX_VALUE;
|
||||
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
|
||||
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
|
||||
|
||||
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
GenomeLoc location = locus.getLocation();
|
||||
// We keep processing while the next reference location is within the interval
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
final GenomeLoc location = locus.getLocation();
|
||||
|
||||
if(prevLoc != null) {
|
||||
// fill in the active / inactive labels from the stop of the previous location to the start of this location
|
||||
// TODO refactor to separate function
|
||||
for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) {
|
||||
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
|
||||
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
|
||||
profile.add(fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ));
|
||||
}
|
||||
}
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
// Note that this must occur before we leave because we are outside the intervals because
|
||||
// reads may occur outside our intervals but overlap them in the future
|
||||
// TODO -- this whole HashSet logic should be changed to a linked list of reads with
|
||||
// TODO -- subsequent pass over them to find the ones overlapping the active regions
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
|
||||
profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
|
||||
}
|
||||
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||
// which active regions in the work queue are now safe to process
|
||||
minStart = Math.min(minStart, read.getAlignmentStart());
|
||||
}
|
||||
|
||||
prevLoc = location;
|
||||
|
||||
printProgress(locus.getLocation());
|
||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||
// which active regions in the work queue are now safe to process
|
||||
minStart = Math.min(minStart, read.getAlignmentStart());
|
||||
}
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
// skip this location -- it's not part of our engine intervals
|
||||
if ( outsideEngineIntervals(location) )
|
||||
continue;
|
||||
|
||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
// add these blocks of work to the work queue
|
||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
||||
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
|
||||
|
||||
// add active regions to queue of regions to process
|
||||
// first check if can merge active regions over shard boundaries
|
||||
if( !activeRegions.isEmpty() ) {
|
||||
if( !workQueue.isEmpty() ) {
|
||||
final ActiveRegion last = workQueue.getLast();
|
||||
final ActiveRegion first = activeRegions.get(0);
|
||||
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
|
||||
workQueue.removeLast();
|
||||
activeRegions.remove(first);
|
||||
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
|
||||
}
|
||||
}
|
||||
workQueue.addAll( activeRegions );
|
||||
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
|
||||
// we've move across some interval boundary, restart profile
|
||||
profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
|
||||
}
|
||||
|
||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// now go and process all of the active regions
|
||||
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
|
||||
|
||||
prevLoc = location;
|
||||
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
|
||||
if ( ! profile.isEmpty() )
|
||||
incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
|
||||
|
||||
// add active regions to queue of regions to process
|
||||
// first check if can merge active regions over shard boundaries
|
||||
if( !activeRegions.isEmpty() ) {
|
||||
if( !workQueue.isEmpty() ) {
|
||||
final ActiveRegion last = workQueue.getLast();
|
||||
final ActiveRegion first = activeRegions.get(0);
|
||||
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
|
||||
workQueue.removeLast();
|
||||
activeRegions.remove(first);
|
||||
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
|
||||
}
|
||||
}
|
||||
workQueue.addAll( activeRegions );
|
||||
}
|
||||
|
||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
|
||||
// now go and process all of the active regions
|
||||
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the loc outside of the intervals being requested for processing by the GATK?
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
private boolean outsideEngineIntervals(final GenomeLoc loc) {
|
||||
return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
* add these blocks of work to the work queue
|
||||
* band-pass filter the list of isActive probabilities and turn into active regions
|
||||
*
|
||||
* @param profile
|
||||
* @param activeRegions
|
||||
* @param activeRegionExtension
|
||||
* @param maxRegionSize
|
||||
* @return
|
||||
*/
|
||||
private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
|
||||
final List<ActiveRegion> activeRegions,
|
||||
final int activeRegionExtension,
|
||||
final int maxRegionSize) {
|
||||
if ( profile.isEmpty() )
|
||||
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
|
||||
|
||||
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
||||
activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
|
||||
return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -150,7 +176,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
final RefMetaDataTracker tracker, final ReferenceContext refContext,
|
||||
final AlignmentContext locus, final GenomeLoc location) {
|
||||
if ( walker.hasPresetActiveRegions() ) {
|
||||
return new ActivityProfileResult(walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
|
||||
return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
|
||||
} else {
|
||||
return walker.isActive( tracker, refContext, locus );
|
||||
}
|
||||
|
|
@ -250,30 +276,6 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
return walker.reduce( x, sum );
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// engine interaction code
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data.
|
||||
* @param walker walker to interrogate.
|
||||
* @param dataProvider Data which which to drive the locus view.
|
||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||
*/
|
||||
private LocusView getLocusView( final Walker<M,T> walker, final LocusShardDataProvider dataProvider ) {
|
||||
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
|
||||
if( dataSource == DataSource.READS )
|
||||
return new CoveredLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
|
||||
return new AllLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
|
||||
return new RodLocusView(dataProvider);
|
||||
else
|
||||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||
}
|
||||
|
||||
/**
|
||||
* Special function called in LinearMicroScheduler to empty out the work queue.
|
||||
* Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
|
|||
final List<Double> refQuals, final List<Double> altQuals) {
|
||||
|
||||
if (pileup != null && likelihoodMap == null) {
|
||||
// no per-read likelihoods available:
|
||||
// old UG snp-only path through the annotations
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) {
|
||||
|
|
@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
|
|||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
// BUGBUG: There needs to be a comparable isUsableBase check here
|
||||
if (a.isNoCall())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
refQuals.add((double)el.getKey().getMappingQuality());
|
||||
else if (allAlleles.contains(a))
|
||||
altQuals.add((double)el.getKey().getMappingQuality());
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
ReadBackedPileup pileup = null;
|
||||
|
||||
|
||||
if (stratifiedContexts != null) {
|
||||
if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context != null )
|
||||
pileup = context.getBasePileup();
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
final List<Double> refQuals, final List<Double> altQuals) {
|
||||
|
||||
if (alleleLikelihoodMap == null) {
|
||||
// use fast SNP-based version if we don't have per-read allele likelihoods
|
||||
// use old UG SNP-based version if we don't have per-read allele likelihoods
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( isUsableBase(p) ) {
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ import java.util.*;
|
|||
@Allows(value={DataSource.READS, DataSource.REFERENCE})
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
@By(DataSource.REFERENCE)
|
||||
public class VariantAnnotator extends RodWalker<Integer, Integer> implements AnnotatorCompatible {
|
||||
public class VariantAnnotator extends RodWalker<Integer, Integer> implements AnnotatorCompatible, TreeReducible<Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
|
@ -275,14 +275,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the number of loci processed to zero.
|
||||
*
|
||||
* @return 0
|
||||
*/
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
|
||||
/**
|
||||
* We want reads that span deletions
|
||||
*
|
||||
|
|
@ -323,15 +315,15 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the number of loci processed.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return the new number of loci processed.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) { return value + sum; }
|
||||
|
||||
@Override
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -277,8 +277,12 @@ public class VariantAnnotatorEngine {
|
|||
if ( expression.fieldName.equals("ID") ) {
|
||||
if ( vc.hasID() )
|
||||
infoAnnotations.put(expression.fullName, vc.getID());
|
||||
} else if (expression.fieldName.equals("ALT")) {
|
||||
infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString());
|
||||
|
||||
} else if ( vc.hasAttribute(expression.fieldName) ) {
|
||||
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
||||
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
*/
|
||||
public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
|
||||
|
||||
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead);
|
||||
final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) );
|
||||
if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it
|
||||
|
||||
RecalUtils.parsePlatformForRead(read, RAC);
|
||||
|
|
@ -268,16 +268,25 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
}
|
||||
|
||||
protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
|
||||
final int BUFFER_SIZE = 0;
|
||||
final int readLength = read.getReadBases().length;
|
||||
final boolean[] knownSites = new boolean[readLength];
|
||||
Arrays.fill(knownSites, false);
|
||||
for( final Feature f : features ) {
|
||||
int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
|
||||
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; }
|
||||
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
featureStartOnRead = 0;
|
||||
}
|
||||
|
||||
int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
|
||||
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; }
|
||||
Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true);
|
||||
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
featureEndOnRead = readLength;
|
||||
}
|
||||
|
||||
if( featureStartOnRead > readLength ) {
|
||||
featureStartOnRead = featureEndOnRead = readLength;
|
||||
}
|
||||
|
||||
Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true);
|
||||
}
|
||||
return knownSites;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -75,8 +75,9 @@ public class RecalibrationArgumentCollection {
|
|||
|
||||
/**
|
||||
* If not provided, then a temporary file is created and then deleted upon completion.
|
||||
* For advanced users only.
|
||||
*/
|
||||
@Hidden
|
||||
@Advanced
|
||||
@Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
|
||||
public File RECAL_CSV_FILE = null;
|
||||
|
||||
|
|
@ -101,13 +102,10 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
|
||||
public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
/**
|
||||
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
|
||||
*/
|
||||
@Hidden
|
||||
@Advanced
|
||||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
public boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
|
|
@ -138,6 +136,13 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
|
||||
public int INDELS_CONTEXT_SIZE = 3;
|
||||
|
||||
/**
|
||||
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
|
||||
* This argument is ignored if the Cycle covariate is not used.
|
||||
*/
|
||||
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
|
||||
public int MAXIMUM_CYCLE_VALUE = 500;
|
||||
|
||||
/**
|
||||
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
|
||||
*/
|
||||
|
|
@ -175,9 +180,15 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
|
||||
public String BINARY_TAG_NAME = null;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String DEFAULT_PLATFORM = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
|
|
|
|||
|
|
@ -191,7 +191,7 @@ public class CallableLoci extends LocusWalker<CallableLoci.CallableBaseState, Ca
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
|
||||
OutputFormat outputFormat;
|
||||
OutputFormat outputFormat = OutputFormat.BED;
|
||||
|
||||
public enum OutputFormat {
|
||||
/**
|
||||
|
|
@ -297,7 +297,7 @@ public class CallableLoci extends LocusWalker<CallableLoci.CallableBaseState, Ca
|
|||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
|
||||
return String.format("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart()-1, loc.getStop(), state);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,11 +6,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
|
|
@ -20,6 +19,21 @@ import java.util.Map;
|
|||
*/
|
||||
public class CoverageUtils {
|
||||
|
||||
public enum CountPileupType {
|
||||
/**
|
||||
* Count all reads independently (even if from the same fragment).
|
||||
*/
|
||||
COUNT_READS,
|
||||
/**
|
||||
* Count all fragments (even if the reads that compose the fragment are not consistent at that base).
|
||||
*/
|
||||
COUNT_FRAGMENTS,
|
||||
/**
|
||||
* Count all fragments (but only if the reads that compose the fragment are consistent at that base).
|
||||
*/
|
||||
COUNT_FRAGMENTS_REQUIRE_SAME_BASE
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context
|
||||
* as an array of ints, indexed by the index fields of BaseUtils
|
||||
|
|
@ -64,10 +78,10 @@ public class CoverageUtils {
|
|||
}
|
||||
|
||||
public static Map<DoCOutputType.Partition,Map<String,int[]>>
|
||||
getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, Collection<DoCOutputType.Partition> types) {
|
||||
getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection<DoCOutputType.Partition> types) {
|
||||
|
||||
Map<DoCOutputType.Partition,Map<String,int[]>> countsByIDByType = new HashMap<DoCOutputType.Partition,Map<String,int[]>>();
|
||||
Map<SAMReadGroupRecord,int[]> countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ);
|
||||
Map<SAMReadGroupRecord,int[]> countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType);
|
||||
for (DoCOutputType.Partition t : types ) {
|
||||
// iterate through the read group counts and build the type associations
|
||||
for ( Map.Entry<SAMReadGroupRecord,int[]> readGroupCountEntry : countsByRG.entrySet() ) {
|
||||
|
|
@ -95,31 +109,95 @@ public class CoverageUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) {
|
||||
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) {
|
||||
Map<SAMReadGroupRecord, int[]> countsByRG = new HashMap<SAMReadGroupRecord,int[]>();
|
||||
for ( PileupElement e : context.getBasePileup() ) {
|
||||
if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) {
|
||||
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
|
||||
if ( ! countsByRG.keySet().contains(readGroup) ) {
|
||||
countsByRG.put(readGroup,new int[6]);
|
||||
updateCounts(countsByRG.get(readGroup),e);
|
||||
} else {
|
||||
updateCounts(countsByRG.get(readGroup),e);
|
||||
|
||||
List<PileupElement> countPileup = new LinkedList<PileupElement>();
|
||||
FragmentCollection<PileupElement> fpile;
|
||||
|
||||
switch (countType) {
|
||||
|
||||
case COUNT_READS:
|
||||
for (PileupElement e : context.getBasePileup())
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
countPileup.add(e);
|
||||
break;
|
||||
|
||||
case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters:
|
||||
fpile = context.getBasePileup().getStartSortedPileup().toFragments();
|
||||
|
||||
for (PileupElement e : fpile.getSingletonReads())
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
countPileup.add(e);
|
||||
|
||||
for (List<PileupElement> overlappingPair : fpile.getOverlappingPairs()) {
|
||||
// iterate over all elements in fragment:
|
||||
for (PileupElement e : overlappingPair) {
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) {
|
||||
countPileup.add(e); // add the first passing element per fragment
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case COUNT_FRAGMENTS_REQUIRE_SAME_BASE:
|
||||
fpile = context.getBasePileup().getStartSortedPileup().toFragments();
|
||||
|
||||
for (PileupElement e : fpile.getSingletonReads())
|
||||
if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
countPileup.add(e);
|
||||
|
||||
for (List<PileupElement> overlappingPair : fpile.getOverlappingPairs()) {
|
||||
PileupElement firstElem = null;
|
||||
PileupElement addElem = null;
|
||||
|
||||
// iterate over all elements in fragment:
|
||||
for (PileupElement e : overlappingPair) {
|
||||
if (firstElem == null)
|
||||
firstElem = e;
|
||||
else if (e.getBase() != firstElem.getBase()) {
|
||||
addElem = null;
|
||||
break;
|
||||
}
|
||||
|
||||
// will add the first passing element per base-consistent fragment:
|
||||
if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ))
|
||||
addElem = e;
|
||||
}
|
||||
|
||||
if (addElem != null)
|
||||
countPileup.add(addElem);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new UserException("Must use valid CountPileupType");
|
||||
}
|
||||
|
||||
for (PileupElement e : countPileup) {
|
||||
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
|
||||
if (!countsByRG.keySet().contains(readGroup))
|
||||
countsByRG.put(readGroup, new int[6]);
|
||||
|
||||
updateCounts(countsByRG.get(readGroup), e);
|
||||
}
|
||||
|
||||
return countsByRG;
|
||||
}
|
||||
|
||||
private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) {
|
||||
return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ));
|
||||
}
|
||||
|
||||
private static void updateCounts(int[] counts, PileupElement e) {
|
||||
if ( e.isDeletion() ) {
|
||||
counts[BaseUtils.DELETION_INDEX]++;
|
||||
counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount();
|
||||
} else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) {
|
||||
counts[BaseUtils.NO_CALL_INDEX]++;
|
||||
counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount();
|
||||
} else {
|
||||
try {
|
||||
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++;
|
||||
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount();
|
||||
} catch (ArrayIndexOutOfBoundsException exc) {
|
||||
throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -129,11 +129,15 @@ public class DepthOfCoverage extends LocusWalker<Map<DoCOutputType.Partition,Map
|
|||
int minMappingQuality = -1;
|
||||
@Argument(fullName = "maxMappingQuality", doc = "Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE).", required = false)
|
||||
int maxMappingQuality = Integer.MAX_VALUE;
|
||||
|
||||
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to -1.", required = false)
|
||||
byte minBaseQuality = -1;
|
||||
@Argument(fullName = "maxBaseQuality", doc = "Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE).", required = false)
|
||||
byte maxBaseQuality = Byte.MAX_VALUE;
|
||||
|
||||
@Argument(fullName = "countType", doc = "How should overlapping reads from the same fragment be handled?", required = false)
|
||||
CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS;
|
||||
|
||||
/**
|
||||
* Instead of reporting depth, report the base pileup at each locus
|
||||
*/
|
||||
|
|
@ -373,7 +377,7 @@ public class DepthOfCoverage extends LocusWalker<Map<DoCOutputType.Partition,Map
|
|||
//System.out.printf("\t[log]\t%s",ref.getLocus());
|
||||
}
|
||||
|
||||
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,partitionTypes);
|
||||
return CoverageUtils.getBaseCountsByPartition(context,minMappingQuality,maxMappingQuality,minBaseQuality,maxBaseQuality,countType,partitionTypes);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
|
|||
int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());
|
||||
|
||||
// note the linear probability scale
|
||||
return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1));
|
||||
return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1));
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,12 @@ import java.util.List;
|
|||
* <p>
|
||||
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
|
||||
*
|
||||
* The output format can be partially controlled using the provided command-line arguments.
|
||||
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
|
||||
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
|
||||
* separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* Several important notes:
|
||||
* 1) if there are multiple variants that start at a site, it chooses one of them randomly.
|
||||
* 2) when there are overlapping indels (but with different start positions) only the first will be chosen.
|
||||
|
|
|
|||
|
|
@ -190,6 +190,10 @@ public class UnifiedGenotyperEngine {
|
|||
final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap);
|
||||
if ( vc != null )
|
||||
results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap));
|
||||
// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set
|
||||
// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES)
|
||||
// results.add(generateEmptyContext(tracker, refContext, null, rawContext));
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
|
|
@ -236,6 +236,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
* then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the
|
||||
* value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output
|
||||
* file name (2nd column) for each input file name (1st column).
|
||||
*
|
||||
* Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing).
|
||||
*/
|
||||
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
|
||||
protected String N_WAY_OUT = null;
|
||||
|
|
@ -274,7 +276,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
protected String OUT_SNPS = null;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private IndexedFastaSequenceFile referenceReader;
|
||||
private CachingIndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// the intervals input by the user
|
||||
private Iterator<GenomeLoc> intervals = null;
|
||||
|
|
@ -1601,7 +1603,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
public List<GATKSAMRecord> getReads() { return reads; }
|
||||
|
||||
public byte[] getReference(IndexedFastaSequenceFile referenceReader) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getReference(CachingIndexedFastaSequenceFile referenceReader) {
|
||||
// set up the reference if we haven't done so yet
|
||||
if ( reference == null ) {
|
||||
// first, pad the reference to handle deletions in narrow windows (e.g. those with only 1 read)
|
||||
|
|
@ -1609,7 +1612,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
int padRight = Math.min(loc.getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(loc.getContig()).getSequenceLength());
|
||||
loc = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), padLeft, padRight);
|
||||
reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
|
||||
StringUtil.toUpperCase(reference);
|
||||
}
|
||||
|
||||
return reference;
|
||||
|
|
|
|||
|
|
@ -183,6 +183,10 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
|
|||
@Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
|
||||
private boolean keepSitesWithAC0 = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="numSamples", shortName="numSamples", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
|
||||
private int numSamplesFromArgument = 0;
|
||||
|
||||
/**
|
||||
* If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
|
||||
* variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
|
||||
|
|
@ -589,6 +593,14 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
|
|||
public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; }
|
||||
public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
|
||||
|
||||
public int getNumberOfSamplesForEvaluation() {
|
||||
if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty())
|
||||
return sampleNamesForEvaluation.size();
|
||||
else {
|
||||
return numSamplesFromArgument;
|
||||
}
|
||||
|
||||
}
|
||||
public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }
|
||||
|
||||
public List<RodBinding<VariantContext>> getComps() { return comps; }
|
||||
|
|
|
|||
|
|
@ -1,249 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* @author rpoplin
|
||||
* @since Apr 6, 2010
|
||||
*/
|
||||
|
||||
//@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score")
|
||||
@Deprecated
|
||||
public class VariantQualityScore {
|
||||
// TODO - this should really be a stratification
|
||||
|
||||
// public class VariantQualityScore extends VariantEvaluator {
|
||||
//
|
||||
// // a mapping from quality score histogram bin to Ti/Tv ratio
|
||||
// @DataPoint(description = "the Ti/Tv ratio broken out by variant quality")
|
||||
// TiTvStats titvStats = null;
|
||||
//
|
||||
// @DataPoint(description = "average variant quality for each allele count")
|
||||
// AlleleCountStats alleleCountStats = null;
|
||||
//
|
||||
// static class TiTvStats extends TableType {
|
||||
// final static int NUM_BINS = 20;
|
||||
// final HashMap<Integer, Pair<Long,Long>> qualByIsTransition = new HashMap<Integer, Pair<Long,Long>>(); // A hashMap holds all the qualities until we are able to bin them appropriately
|
||||
// final long transitionByQuality[] = new long[NUM_BINS];
|
||||
// final long transversionByQuality[] = new long[NUM_BINS];
|
||||
// final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out
|
||||
//
|
||||
// public Object[] getRowKeys() {
|
||||
// return new String[]{"sample"};
|
||||
// }
|
||||
//
|
||||
// public Object[] getColumnKeys() {
|
||||
// final String columnKeys[] = new String[NUM_BINS];
|
||||
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
// columnKeys[iii] = "titvBin" + iii;
|
||||
// }
|
||||
// return columnKeys;
|
||||
// }
|
||||
//
|
||||
// public String getCell(int x, int y) {
|
||||
// return String.valueOf(titvByQuality[y]);
|
||||
// }
|
||||
//
|
||||
// public String toString() {
|
||||
// StringBuffer returnString = new StringBuffer();
|
||||
// // output the ti/tv array
|
||||
// returnString.append("titvByQuality: ");
|
||||
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
// returnString.append(titvByQuality[iii]);
|
||||
// returnString.append(" ");
|
||||
// }
|
||||
// return returnString.toString();
|
||||
// }
|
||||
//
|
||||
// public void incrValue( final double qual, final boolean isTransition ) {
|
||||
// final Integer qualKey = Math.round((float) qual);
|
||||
// final long numTransition = (isTransition ? 1L : 0L);
|
||||
// final long numTransversion = (isTransition ? 0L : 1L);
|
||||
// if( qualByIsTransition.containsKey(qualKey) ) {
|
||||
// Pair<Long,Long> transitionPair = qualByIsTransition.get(qualKey);
|
||||
// transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion);
|
||||
// qualByIsTransition.put(qualKey, transitionPair);
|
||||
// } else {
|
||||
// qualByIsTransition.put(qualKey, new Pair<Long,Long>(numTransition,numTransversion));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// public void organizeTiTvTables() {
|
||||
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
// transitionByQuality[iii] = 0L;
|
||||
// transversionByQuality[iii] = 0L;
|
||||
// titvByQuality[iii] = 0.0;
|
||||
// }
|
||||
//
|
||||
// int maxQual = 0;
|
||||
//
|
||||
// // Calculate the maximum quality score in order to normalize and histogram
|
||||
// for( final Integer qual : qualByIsTransition.keySet() ) {
|
||||
// if( qual > maxQual ) {
|
||||
// maxQual = qual;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1));
|
||||
//
|
||||
// for( final Integer qual : qualByIsTransition.keySet() ) {
|
||||
// final int index = (int)Math.floor( ((double) qual) / binSize );
|
||||
// if( index >= 0 ) { // BUGBUG: why is there overflow here?
|
||||
// Pair<Long,Long> transitionPair = qualByIsTransition.get(qual);
|
||||
// transitionByQuality[index] += transitionPair.getFirst();
|
||||
// transversionByQuality[index] += transitionPair.getSecond();
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
// if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio
|
||||
// titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]);
|
||||
// } else {
|
||||
// titvByQuality[iii] = 0.0;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// class AlleleCountStats extends TableType {
|
||||
// final HashMap<Integer, ArrayList<Double>> qualityListMap = new HashMap<Integer, ArrayList<Double>>();
|
||||
// final HashMap<Integer, Double> qualityMap = new HashMap<Integer, Double>();
|
||||
//
|
||||
// public Object[] getRowKeys() {
|
||||
// final int NUM_BINS = qualityListMap.keySet().size();
|
||||
// final String rowKeys[] = new String[NUM_BINS];
|
||||
// int iii = 0;
|
||||
// for( final Integer key : qualityListMap.keySet() ) {
|
||||
// rowKeys[iii] = "AC" + key;
|
||||
// iii++;
|
||||
// }
|
||||
// return rowKeys;
|
||||
//
|
||||
// }
|
||||
//
|
||||
// public Object[] getColumnKeys() {
|
||||
// return new String[]{"alleleCount","avgQual"};
|
||||
// }
|
||||
//
|
||||
// public String getCell(int x, int y) {
|
||||
// int iii = 0;
|
||||
// for( final Integer key : qualityListMap.keySet() ) {
|
||||
// if(iii == x) {
|
||||
// if(y == 0) { return String.valueOf(key); }
|
||||
// else { return String.valueOf(qualityMap.get(key)); }
|
||||
// }
|
||||
// iii++;
|
||||
// }
|
||||
// return null;
|
||||
// }
|
||||
//
|
||||
// public String toString() {
|
||||
// String returnString = "";
|
||||
// // output the quality map
|
||||
// returnString += "AlleleCountStats: ";
|
||||
// //for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
// // returnString += titvByQuality[iii] + " ";
|
||||
// //}
|
||||
// return returnString;
|
||||
// }
|
||||
//
|
||||
// public void incrValue( final double qual, final int alleleCount ) {
|
||||
// ArrayList<Double> list = qualityListMap.get(alleleCount);
|
||||
// if(list==null) { list = new ArrayList<Double>(); }
|
||||
// list.add(qual);
|
||||
// qualityListMap.put(alleleCount, list);
|
||||
// }
|
||||
//
|
||||
// public void organizeAlleleCountTables() {
|
||||
// for( final Integer key : qualityListMap.keySet() ) {
|
||||
// final ArrayList<Double> list = qualityListMap.get(key);
|
||||
// double meanQual = 0.0;
|
||||
// final double numQuals = (double)list.size();
|
||||
// for( Double qual : list ) {
|
||||
// meanQual += qual / numQuals;
|
||||
// }
|
||||
// qualityMap.put(key, meanQual);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// //public VariantQualityScore(VariantEvalWalker parent) {
|
||||
// //super(parent);
|
||||
// //}
|
||||
//
|
||||
// public String getName() {
|
||||
// return "VariantQualityScore";
|
||||
// }
|
||||
//
|
||||
// public int getComparisonOrder() {
|
||||
// return 1; // we only need to see each eval track
|
||||
// }
|
||||
//
|
||||
// public String toString() {
|
||||
// return getName();
|
||||
// }
|
||||
//
|
||||
// public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
// final String interesting = null;
|
||||
//
|
||||
// if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
|
||||
// if( titvStats == null ) { titvStats = new TiTvStats(); }
|
||||
// titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));
|
||||
//
|
||||
// if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); }
|
||||
// int alternateAlleleCount = 0;
|
||||
// for (final Allele a : eval.getAlternateAlleles()) {
|
||||
// alternateAlleleCount += eval.getCalledChrCount(a);
|
||||
// }
|
||||
// alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount);
|
||||
// }
|
||||
//
|
||||
// return interesting; // This module doesn't capture any interesting sites, so return null
|
||||
// }
|
||||
//
|
||||
// public void finalizeEvaluation() {
|
||||
// if( titvStats != null ) {
|
||||
// titvStats.organizeTiTvTables();
|
||||
// }
|
||||
// if( alleleCountStats != null ) {
|
||||
// alleleCountStats.organizeAlleleCountTables();
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
|
@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier {
|
|||
|
||||
// There are ploidy x n sample chromosomes
|
||||
// TODO -- generalize to handle multiple ploidy
|
||||
nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy();
|
||||
nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy();
|
||||
if ( nchrom < 2 )
|
||||
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample");
|
||||
|
||||
|
|
|
|||
|
|
@ -151,14 +151,6 @@ import java.util.*;
|
|||
* -mvq 50 \
|
||||
* -o violations.vcf
|
||||
*
|
||||
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* -number 1000
|
||||
*
|
||||
* Creating a set with 50% of the total number of variants in the variant VCF:
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
@ -659,7 +651,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED)));
|
||||
}
|
||||
|
||||
private boolean haveSameGenotypes(Genotype g1, Genotype g2) {
|
||||
private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) {
|
||||
if ( g1 == null || g2 == null )
|
||||
return false;
|
||||
|
||||
if ((g1.isCalled() && g2.isFiltered()) ||
|
||||
(g2.isCalled() && g1.isFiltered()) ||
|
||||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
|
||||
|
|
|
|||
|
|
@ -24,33 +24,6 @@ public class BaseUtils {
|
|||
public final static byte[] BASES = {'A', 'C', 'G', 'T'};
|
||||
public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
|
||||
|
||||
public enum Base {
|
||||
A('A', 0),
|
||||
C('C', 1),
|
||||
G('G', 2),
|
||||
T('T', 3);
|
||||
|
||||
byte b;
|
||||
int index;
|
||||
|
||||
private Base(char base, int index) {
|
||||
this.b = (byte) base;
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
public byte getBase() { return b; }
|
||||
|
||||
public char getBaseAsChar() { return (char) b; }
|
||||
|
||||
public int getIndex() { return index; }
|
||||
|
||||
public boolean sameBase(byte o) { return b == o; }
|
||||
|
||||
public boolean sameBase(char o) { return b == (byte) o; }
|
||||
|
||||
public boolean sameBase(int i) { return index == i; }
|
||||
}
|
||||
|
||||
static private final int[] baseIndexMap = new int[256];
|
||||
static {
|
||||
Arrays.fill(baseIndexMap, -1);
|
||||
|
|
@ -130,6 +103,17 @@ public class BaseUtils {
|
|||
return false;
|
||||
}
|
||||
|
||||
public static boolean isUpperCase(final byte[] bases) {
|
||||
for ( byte base : bases )
|
||||
if ( ! isUpperCase(base) )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean isUpperCase(final byte base) {
|
||||
return base >= 'A' && base <= 'Z';
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a IUPAC nucleotide code to a pair of bases
|
||||
*
|
||||
|
|
@ -271,59 +255,6 @@ public class BaseUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a base index to a base index representing its cross-talk partner
|
||||
*
|
||||
* @param baseIndex 0, 1, 2, 3
|
||||
* @return 1, 0, 3, 2, or -1 if the index can't be understood
|
||||
*/
|
||||
static public int crossTalkPartnerIndex(int baseIndex) {
|
||||
switch (baseIndex) {
|
||||
case 0:
|
||||
return 1; // A -> C
|
||||
case 1:
|
||||
return 0; // C -> A
|
||||
case 2:
|
||||
return 3; // G -> T
|
||||
case 3:
|
||||
return 2; // T -> G
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a base to the base representing its cross-talk partner
|
||||
*
|
||||
* @param base [AaCcGgTt]
|
||||
* @return C, A, T, G, or '.' if the base can't be understood
|
||||
*/
|
||||
@Deprecated
|
||||
static public char crossTalkPartnerBase(char base) {
|
||||
return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the complement of a base index.
|
||||
*
|
||||
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
|
||||
* @return the complementary base index
|
||||
*/
|
||||
static public byte complementIndex(int baseIndex) {
|
||||
switch (baseIndex) {
|
||||
case 0:
|
||||
return 3; // a -> t
|
||||
case 1:
|
||||
return 2; // c -> g
|
||||
case 2:
|
||||
return 1; // g -> c
|
||||
case 3:
|
||||
return 0; // t -> a
|
||||
default:
|
||||
return -1; // wtf?
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
|
||||
*
|
||||
|
|
@ -350,7 +281,7 @@ public class BaseUtils {
|
|||
}
|
||||
|
||||
@Deprecated
|
||||
static public char simpleComplement(char base) {
|
||||
static private char simpleComplement(char base) {
|
||||
return (char) simpleComplement((byte) base);
|
||||
}
|
||||
|
||||
|
|
@ -370,22 +301,6 @@ public class BaseUtils {
|
|||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
|
||||
*
|
||||
* @param bases the byte array of bases
|
||||
* @return the complement of the base byte array
|
||||
*/
|
||||
static public byte[] simpleComplement(byte[] bases) {
|
||||
byte[] rcbases = new byte[bases.length];
|
||||
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
rcbases[i] = simpleComplement(bases[i]);
|
||||
}
|
||||
|
||||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse complement a char array of bases
|
||||
*
|
||||
|
|
@ -403,23 +318,6 @@ public class BaseUtils {
|
|||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Complement a char array of bases
|
||||
*
|
||||
* @param bases the char array of bases
|
||||
* @return the complement of the base char array
|
||||
*/
|
||||
@Deprecated
|
||||
static public char[] simpleComplement(char[] bases) {
|
||||
char[] rcbases = new char[bases.length];
|
||||
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
rcbases[i] = simpleComplement(bases[i]);
|
||||
}
|
||||
|
||||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse complement a String of bases. Preserves ambiguous bases.
|
||||
*
|
||||
|
|
@ -431,17 +329,6 @@ public class BaseUtils {
|
|||
return new String(simpleReverseComplement(bases.getBytes()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Complement a String of bases. Preserves ambiguous bases.
|
||||
*
|
||||
* @param bases the String of bases
|
||||
* @return the complement of the String
|
||||
*/
|
||||
@Deprecated
|
||||
static public String simpleComplement(String bases) {
|
||||
return new String(simpleComplement(bases.getBytes()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the uppercased version of the bases
|
||||
*
|
||||
|
|
@ -543,82 +430,4 @@ public class BaseUtils {
|
|||
|
||||
return randomBaseIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a random base (A, C, G, T).
|
||||
*
|
||||
* @return a random base (A, C, G, T)
|
||||
*/
|
||||
@Deprecated
|
||||
static public byte getRandomBase() {
|
||||
return getRandomBase('.');
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a random base, excluding some base.
|
||||
*
|
||||
* @param excludeBase the base to exclude
|
||||
* @return a random base, excluding the one specified (A, C, G, T)
|
||||
*/
|
||||
@Deprecated
|
||||
static public byte getRandomBase(char excludeBase) {
|
||||
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
|
||||
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
|
||||
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
|
||||
* of 2 (it has a period of 4 as well), and so does
|
||||
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
|
||||
* the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
|
||||
* or if specified minPeriod is greater than the sequence length.
|
||||
*
|
||||
* @param seq
|
||||
* @return
|
||||
*/
|
||||
public static int sequencePeriod(byte[] seq, int minPeriod) {
|
||||
int period = (minPeriod > seq.length ? seq.length : minPeriod);
|
||||
// we assume that bases [0,period-1] repeat themselves and check this assumption
|
||||
// until we find correct period
|
||||
|
||||
for (int pos = period; pos < seq.length; pos++) {
|
||||
|
||||
int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
|
||||
// if our current hypothesis holds, base[pos] must be the same as base[offset]
|
||||
|
||||
if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
|
||||
|
||||
// period we have been trying so far does not work.
|
||||
// two possibilities:
|
||||
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
|
||||
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
|
||||
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
|
||||
// pos will be autoincremented and we will be checking next base
|
||||
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
|
||||
// hence we should first check if it matches the first base of the sequence, and to do that
|
||||
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
|
||||
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
|
||||
// on the next loop re-entrance after pos is autoincremented)
|
||||
if (offset == 0)
|
||||
period = pos + 1;
|
||||
else
|
||||
period = pos--;
|
||||
|
||||
}
|
||||
}
|
||||
return period;
|
||||
}
|
||||
}
|
||||
|
||||
/* code snippet for testing sequencePeriod():
|
||||
*
|
||||
* String str = "CCTTG";
|
||||
int p = 0;
|
||||
System.out.print("Periods of " + str +" are:");
|
||||
while ( p < str.length() ) {
|
||||
p = sequencePeriod(str, p+1);
|
||||
System.out.print(" "+p);
|
||||
}
|
||||
System.out.println(); System.exit(1);
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() ));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests whether this genome loc starts at the same position as that.
|
||||
*
|
||||
* i.e., do this and that have the same contig and the same start position
|
||||
*
|
||||
* @param that genome loc to compare to
|
||||
* @return true if this and that have the same contig and the same start position
|
||||
*/
|
||||
@Requires("that != null")
|
||||
public final boolean startsAt( GenomeLoc that ) {
|
||||
int comparison = this.compareContigs(that);
|
||||
return comparison == 0 && this.getStart() == that.getStart();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests whether any portion of this contig is before that contig.
|
||||
* @param that Other contig to test.
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
// our private storage for the GenomeLoc's
|
||||
private List<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
|
||||
|
||||
// cache this to make overlap checking much more efficient
|
||||
private int previousOverlapSearchIndex = -1;
|
||||
|
||||
/** default constructor */
|
||||
public GenomeLocSortedSet(GenomeLocParser parser) {
|
||||
this.genomeLocParser = parser;
|
||||
|
|
@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
* Return the number of bps before loc in the sorted set
|
||||
*
|
||||
* @param loc the location before which we are counting bases
|
||||
* @return
|
||||
* @return the number of base pairs over all previous intervals
|
||||
*/
|
||||
public long sizeBeforeLoc(GenomeLoc loc) {
|
||||
long s = 0;
|
||||
|
|
@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
if ( e.isBefore(loc) )
|
||||
s += e.size();
|
||||
else if ( e.isPast(loc) )
|
||||
; // don't do anything
|
||||
break; // we are done
|
||||
else // loc is inside of s
|
||||
s += loc.getStart() - e.getStart();
|
||||
}
|
||||
|
|
@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
* Determine if the given loc overlaps any loc in the sorted set
|
||||
*
|
||||
* @param loc the location to test
|
||||
* @return
|
||||
* @return trip if the location overlaps any loc
|
||||
*/
|
||||
public boolean overlaps(final GenomeLoc loc) {
|
||||
for(final GenomeLoc e : mArray) {
|
||||
if(e.overlapsP(loc)) {
|
||||
return true;
|
||||
}
|
||||
// edge condition
|
||||
if ( mArray.isEmpty() )
|
||||
return false;
|
||||
|
||||
// use the cached version first
|
||||
if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) )
|
||||
return true;
|
||||
|
||||
// update the cached index
|
||||
previousOverlapSearchIndex = Collections.binarySearch(mArray, loc);
|
||||
|
||||
// if it matches an interval exactly, we are done
|
||||
if ( previousOverlapSearchIndex >= 0 )
|
||||
return true;
|
||||
|
||||
// check whether it overlaps the interval before or after the insertion point
|
||||
previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2);
|
||||
return overlapsAtOrImmediatelyAfterCachedIndex(loc, false);
|
||||
}
|
||||
|
||||
private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) {
|
||||
// check the cached entry
|
||||
if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) )
|
||||
return true;
|
||||
|
||||
// check the entry after the cached entry since we may have moved to it
|
||||
boolean returnValue = false;
|
||||
if ( previousOverlapSearchIndex < mArray.size() - 1 ) {
|
||||
returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc);
|
||||
if ( updateCachedIndex )
|
||||
previousOverlapSearchIndex++;
|
||||
}
|
||||
return false;
|
||||
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
mArray.add(e);
|
||||
return true;
|
||||
} else {
|
||||
int loc = Collections.binarySearch(mArray,e);
|
||||
final int loc = Collections.binarySearch(mArray,e);
|
||||
if (loc >= 0) {
|
||||
throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString());
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -49,7 +49,9 @@ public class Haplotype {
|
|||
private int alignmentStartHapwrtRef;
|
||||
public int leftBreakPoint = 0;
|
||||
public int rightBreakPoint = 0;
|
||||
|
||||
private Allele artificialAllele = null;
|
||||
private int artificialAllelePosition = -1;
|
||||
|
||||
/**
|
||||
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
|
||||
*
|
||||
|
|
@ -71,6 +73,12 @@ public class Haplotype {
|
|||
this(bases, 0);
|
||||
}
|
||||
|
||||
protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) {
|
||||
this(bases, 0);
|
||||
this.artificialAllele = artificialAllele;
|
||||
this.artificialAllelePosition = artificialAllelePosition;
|
||||
}
|
||||
|
||||
public Haplotype( final byte[] bases, final GenomeLoc loc ) {
|
||||
this(bases);
|
||||
this.genomeLocation = loc;
|
||||
|
|
@ -171,8 +179,25 @@ public class Haplotype {
|
|||
this.cigar = cigar;
|
||||
}
|
||||
|
||||
public boolean isArtificialHaplotype() {
|
||||
return artificialAllele != null;
|
||||
}
|
||||
|
||||
public Allele getArtificialAllele() {
|
||||
return artificialAllele;
|
||||
}
|
||||
|
||||
public int getArtificialAllelePosition() {
|
||||
return artificialAllelePosition;
|
||||
}
|
||||
|
||||
public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) {
|
||||
this.artificialAllele = artificialAllele;
|
||||
this.artificialAllelePosition = artificialAllelePosition;
|
||||
}
|
||||
|
||||
@Requires({"refInsertLocation >= 0"})
|
||||
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) {
|
||||
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
|
||||
// refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
|
||||
final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true);
|
||||
if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype
|
||||
|
|
@ -182,7 +207,7 @@ public class Haplotype {
|
|||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant
|
||||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
|
||||
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant
|
||||
return new Haplotype(newHaplotypeBases);
|
||||
return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation);
|
||||
}
|
||||
|
||||
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {
|
||||
|
|
|
|||
|
|
@ -293,6 +293,10 @@ public class Utils {
|
|||
}
|
||||
}
|
||||
|
||||
public static <T> String join(final String separator, final T ... objects) {
|
||||
return join(separator, Arrays.asList(objects));
|
||||
}
|
||||
|
||||
public static String dupString(char c, int nCopies) {
|
||||
char[] chars = new char[nCopies];
|
||||
Arrays.fill(chars, c);
|
||||
|
|
@ -701,11 +705,13 @@ public class Utils {
|
|||
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
|
||||
for ( SAMProgramRecord record : oldRecords )
|
||||
if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS )
|
||||
if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS )
|
||||
newRecords.add(record);
|
||||
|
||||
newRecords.add(programRecord);
|
||||
header.setProgramRecords(newRecords);
|
||||
if (programRecord != null) {
|
||||
newRecords.add(programRecord);
|
||||
header.setProgramRecords(newRecords);
|
||||
}
|
||||
return header;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.HasGenomeLocation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -54,27 +54,31 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
|
||||
public ArrayList<GATKSAMRecord> getReads() { return reads; }
|
||||
|
||||
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) {
|
||||
return getActiveRegionReference(referenceReader, 0);
|
||||
}
|
||||
|
||||
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference( referenceReader, padding, extendedLoc );
|
||||
}
|
||||
|
||||
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) {
|
||||
return getFullReference(referenceReader, 0);
|
||||
}
|
||||
|
||||
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
|
||||
return getReference( referenceReader, padding, fullExtentReferenceLoc );
|
||||
}
|
||||
|
||||
private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
|
||||
@Requires("referenceReader.isUppercasingBases()")
|
||||
private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
|
||||
final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(),
|
||||
Math.max(1, genomeLoc.getStart() - padding),
|
||||
Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
|
||||
StringUtil.toUpperCase(reference);
|
||||
return reference;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -24,11 +24,11 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
|
@ -45,6 +45,7 @@ public class ActivityProfile {
|
|||
final GenomeLocParser parser;
|
||||
final boolean presetRegions;
|
||||
GenomeLoc regionStartLoc = null;
|
||||
GenomeLoc regionStopLoc = null;
|
||||
final List<ActivityProfileResult> isActiveList;
|
||||
private static final int FILTER_SIZE = 80;
|
||||
private static final double[] GaussianKernel;
|
||||
|
|
@ -71,19 +72,49 @@ public class ActivityProfile {
|
|||
this.regionStartLoc = regionStartLoc;
|
||||
}
|
||||
|
||||
public void add(final GenomeLoc loc, final ActivityProfileResult result) {
|
||||
if ( loc.size() != 1 )
|
||||
throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" );
|
||||
isActiveList.add(result);
|
||||
if( regionStartLoc == null ) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ActivityProfile{" +
|
||||
"start=" + regionStartLoc +
|
||||
", stop=" + regionStopLoc +
|
||||
'}';
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the next ActivityProfileResult to this profile.
|
||||
*
|
||||
* Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown
|
||||
*
|
||||
* @param result a well-formed ActivityProfileResult result to incorporate into this profile
|
||||
*/
|
||||
@Requires("result != null")
|
||||
public void add(final ActivityProfileResult result) {
|
||||
final GenomeLoc loc = result.getLoc();
|
||||
|
||||
if ( regionStartLoc == null ) {
|
||||
regionStartLoc = loc;
|
||||
regionStopLoc = loc;
|
||||
} else {
|
||||
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
|
||||
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
|
||||
regionStopLoc = loc;
|
||||
}
|
||||
|
||||
isActiveList.add(result);
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return isActiveList.size();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return isActiveList.isEmpty();
|
||||
}
|
||||
|
||||
public boolean hasPresetRegions() {
|
||||
return presetRegions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Band pass this ActivityProfile, producing a new profile that's band pass filtered
|
||||
* @return a new ActivityProfile that's the band-pass filtered version of this profile
|
||||
|
|
@ -104,14 +135,21 @@ public class ActivityProfile {
|
|||
}
|
||||
iii++;
|
||||
}
|
||||
final double[] filteredProbArray = new double[activeProbArray.length];
|
||||
|
||||
final double[] filteredProbArray;
|
||||
if( !presetRegions ) {
|
||||
// if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray
|
||||
filteredProbArray = new double[activeProbArray.length];
|
||||
for( iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||
final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii));
|
||||
final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1));
|
||||
filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
|
||||
}
|
||||
} else {
|
||||
// otherwise we simply use the activeProbArray directly
|
||||
filteredProbArray = activeProbArray;
|
||||
}
|
||||
|
||||
iii = 0;
|
||||
for( final double prob : filteredProbArray ) {
|
||||
final ActivityProfileResult result = isActiveList.get(iii++);
|
||||
|
|
@ -119,6 +157,7 @@ public class ActivityProfile {
|
|||
result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE;
|
||||
result.resultValue = null;
|
||||
}
|
||||
|
||||
return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc);
|
||||
}
|
||||
|
||||
|
|
@ -166,6 +205,7 @@ public class ActivityProfile {
|
|||
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
|
||||
return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
|
||||
}
|
||||
|
||||
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List<ActiveRegion> returnList) {
|
||||
if( !isActive || curEnd - curStart < maxRegionSize ) {
|
||||
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
|
||||
|
|
|
|||
|
|
@ -1,12 +1,16 @@
|
|||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 7/27/12
|
||||
*/
|
||||
|
||||
public class ActivityProfileResult {
|
||||
private GenomeLoc loc;
|
||||
public double isActiveProb;
|
||||
public ActivityProfileResultState resultState;
|
||||
public Number resultValue;
|
||||
|
|
@ -16,16 +20,52 @@ public class ActivityProfileResult {
|
|||
HIGH_QUALITY_SOFT_CLIPS
|
||||
}
|
||||
|
||||
public ActivityProfileResult( final double isActiveProb ) {
|
||||
this.isActiveProb = isActiveProb;
|
||||
this.resultState = ActivityProfileResultState.NONE;
|
||||
this.resultValue = null;
|
||||
/**
|
||||
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb
|
||||
*
|
||||
* @param loc the position of the result profile (for debugging purposes)
|
||||
* @param isActiveProb the probability of being active (between 0 and 1)
|
||||
*/
|
||||
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
|
||||
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) {
|
||||
this(loc, isActiveProb, ActivityProfileResultState.NONE, null);
|
||||
}
|
||||
|
||||
public ActivityProfileResult( final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
|
||||
/**
|
||||
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some
|
||||
* information about the result state and value (TODO RYAN -- what do these mean?)
|
||||
*
|
||||
* @param loc the position of the result profile (for debugging purposes)
|
||||
* @param isActiveProb the probability of being active (between 0 and 1)
|
||||
*/
|
||||
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
|
||||
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
|
||||
// make sure the location of that activity profile is 1
|
||||
if ( loc.size() != 1 )
|
||||
throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc);
|
||||
|
||||
this.loc = loc;
|
||||
this.isActiveProb = isActiveProb;
|
||||
this.resultState = resultState;
|
||||
this.resultValue = resultValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the genome loc associated with the ActivityProfileResult
|
||||
* @return the location of this result
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public GenomeLoc getLoc() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ActivityProfileResult{" +
|
||||
"loc=" + loc +
|
||||
", isActiveProb=" + isActiveProb +
|
||||
", resultState=" + resultState +
|
||||
", resultValue=" + resultValue +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -587,7 +587,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
|
|||
|
||||
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
if ( nParts != genotypeParts.length )
|
||||
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
|
||||
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo);
|
||||
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
|
||||
|
||||
|
|
|
|||
|
|
@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord;
|
|||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -317,4 +322,33 @@ public class VCFUtils {
|
|||
assembly = "hg19";
|
||||
return assembly;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read all of the VCF records from source into memory, returning the header and the VariantContexts
|
||||
*
|
||||
* @param source the file to read, must be in VCF4 format
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public static Pair<VCFHeader, List<VariantContext>> readVCF(final File source) throws IOException {
|
||||
// read in the features
|
||||
final List<VariantContext> vcs = new ArrayList<VariantContext>();
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
pbs.close();
|
||||
|
||||
pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
pbs.skip(header.getHeaderEnd());
|
||||
|
||||
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
|
||||
|
||||
while ( ! pbs.isDone() ) {
|
||||
final VariantContext vc = codec.decode(pbs);
|
||||
if ( vc != null )
|
||||
vcs.add(vc);
|
||||
}
|
||||
|
||||
return new Pair<VCFHeader, List<VariantContext>>(vcfHeader, vcs);
|
||||
}
|
||||
}
|
||||
|
|
@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex;
|
|||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.apache.log4j.Priority;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
|
|
@ -40,6 +41,8 @@ import java.util.Arrays;
|
|||
* A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer.
|
||||
*
|
||||
* Thread-safe! Uses a thread-local cache
|
||||
*
|
||||
* Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set
|
||||
*/
|
||||
public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
||||
protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class);
|
||||
|
|
@ -54,10 +57,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
public static final long DEFAULT_CACHE_SIZE = 1000000;
|
||||
|
||||
/** The cache size of this CachingIndexedFastaSequenceFile */
|
||||
final long cacheSize;
|
||||
private final long cacheSize;
|
||||
|
||||
/** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */
|
||||
final long cacheMissBackup;
|
||||
private final long cacheMissBackup;
|
||||
|
||||
/**
|
||||
* If true, we will preserve the case of the original base in the genome, not
|
||||
*/
|
||||
private final boolean preserveCase;
|
||||
|
||||
// information about checking efficiency
|
||||
long cacheHits = 0;
|
||||
|
|
@ -84,37 +92,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
/**
|
||||
* Same as general constructor but allows one to override the default cacheSize
|
||||
*
|
||||
* @param fasta
|
||||
* @param index
|
||||
* @param cacheSize
|
||||
* @param fasta the file we will read our FASTA sequence from.
|
||||
* @param index the index of the fasta file, used for efficient random access
|
||||
* @param cacheSize the size in bp of the cache we will use for this reader
|
||||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) {
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) {
|
||||
super(fasta, index);
|
||||
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
|
||||
this.cacheSize = cacheSize;
|
||||
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
|
||||
* @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) {
|
||||
this(fasta, index, DEFAULT_CACHE_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for a index file for fasta on disk
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException {
|
||||
this(fasta, DEFAULT_CACHE_SIZE);
|
||||
this.preserveCase = preserveCase;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -124,12 +112,76 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* Uses provided cacheSize instead of the default
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
|
||||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException {
|
||||
super(fasta);
|
||||
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
|
||||
this.cacheSize = cacheSize;
|
||||
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
|
||||
this.preserveCase = preserveCase;
|
||||
}
|
||||
|
||||
// /**
|
||||
// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
// *
|
||||
// * @param fasta The file to open.
|
||||
// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
|
||||
// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
||||
// */
|
||||
// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) {
|
||||
// this(fasta, index, DEFAULT_CACHE_SIZE);
|
||||
// }
|
||||
|
||||
/**
|
||||
* Same as general constructor but allows one to override the default cacheSize
|
||||
*
|
||||
* By default, this CachingIndexedFastaReader converts all incoming bases to upper case
|
||||
*
|
||||
* @param fasta the file we will read our FASTA sequence from.
|
||||
* @param index the index of the fasta file, used for efficient random access
|
||||
* @param cacheSize the size in bp of the cache we will use for this reader
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) {
|
||||
this(fasta, index, cacheSize, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for a index file for fasta on disk.
|
||||
* This CachingIndexedFastaReader will convert all FASTA bases to upper cases under the hood
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException {
|
||||
this(fasta, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for a index file for fasta on disk
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException {
|
||||
this(fasta, DEFAULT_CACHE_SIZE, preserveCase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
*
|
||||
* Looks for a index file for fasta on disk
|
||||
* Uses provided cacheSize instead of the default
|
||||
*
|
||||
* @param fasta The file to open.
|
||||
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
|
||||
this(fasta, cacheSize, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -168,6 +220,25 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
return cacheSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is
|
||||
* everything being made upper case?
|
||||
*
|
||||
* @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased
|
||||
*/
|
||||
public boolean isPreservingCase() {
|
||||
return preserveCase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is uppercasing bases?
|
||||
*
|
||||
* @return true if bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if this reader are in the original case in the fasta
|
||||
*/
|
||||
public boolean isUppercasingBases() {
|
||||
return ! isPreservingCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the subsequence of the contig in the range [start,stop]
|
||||
*
|
||||
|
|
@ -177,8 +248,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* @param contig Contig whose subsequence to retrieve.
|
||||
* @param start inclusive, 1-based start of region.
|
||||
* @param stop inclusive, 1-based stop of region.
|
||||
* @return The partial reference sequence associated with this range.
|
||||
* @return The partial reference sequence associated with this range. If preserveCase is false, then
|
||||
* all of the bases in the ReferenceSequence returned by this method will be upper cased.
|
||||
*/
|
||||
@Override
|
||||
public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) {
|
||||
final ReferenceSequence result;
|
||||
final Cache myCache = cache.get();
|
||||
|
|
@ -186,6 +259,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
if ( (stop - start) >= cacheSize ) {
|
||||
cacheMisses++;
|
||||
result = super.getSubsequenceAt(contig, start, stop);
|
||||
if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases());
|
||||
} else {
|
||||
// todo -- potential optimization is to check if contig.name == contig, as this in generally will be true
|
||||
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
|
||||
|
|
@ -198,7 +272,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
myCache.start = Math.max(start - cacheMissBackup, 0);
|
||||
myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength());
|
||||
myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop);
|
||||
//System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop);
|
||||
|
||||
// convert all of the bases in the sequence to upper case if we aren't preserving cases
|
||||
if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases());
|
||||
} else {
|
||||
cacheHits++;
|
||||
}
|
||||
|
|
@ -215,8 +291,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
}
|
||||
}
|
||||
|
||||
// for debugging -- print out our efficiency if requested
|
||||
if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 )
|
||||
printEfficiency(Priority.INFO);
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -254,19 +254,32 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
* Returns a new ReadBackedPileup where only one read from an overlapping read
|
||||
* pair is retained. If the two reads in question disagree to their basecall,
|
||||
* neither read is retained. If they agree on the base, the read with the higher
|
||||
* quality observation is retained
|
||||
* base quality observation is retained
|
||||
*
|
||||
* @return the newly filtered pileup
|
||||
*/
|
||||
@Override
|
||||
public RBP getOverlappingFragmentFilteredPileup() {
|
||||
public ReadBackedPileup getOverlappingFragmentFilteredPileup() {
|
||||
return getOverlappingFragmentFilteredPileup(true, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new ReadBackedPileup where only one read from an overlapping read
|
||||
* pair is retained. If discardDiscordant and the two reads in question disagree to their basecall,
|
||||
* neither read is retained. Otherwise, the read with the higher
|
||||
* quality (base or mapping, depending on baseQualNotMapQual) observation is retained
|
||||
*
|
||||
* @return the newly filtered pileup
|
||||
*/
|
||||
@Override
|
||||
public RBP getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual) {
|
||||
if (pileupElementTracker instanceof PerSamplePileupElementTracker) {
|
||||
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>) pileupElementTracker;
|
||||
PerSamplePileupElementTracker<PE> filteredTracker = new PerSamplePileupElementTracker<PE>();
|
||||
|
||||
for (final String sample : tracker.getSamples()) {
|
||||
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
|
||||
AbstractReadBackedPileup<RBP, PE> pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup();
|
||||
AbstractReadBackedPileup<RBP, PE> pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual);
|
||||
filteredTracker.addElements(sample, pileup.pileupElementTracker);
|
||||
}
|
||||
return (RBP) createNewPileup(loc, filteredTracker);
|
||||
|
|
@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
|
||||
// if the reads disagree at this position, throw them both out. Otherwise
|
||||
// keep the element with the higher quality score
|
||||
if (existing.getBase() != p.getBase()) {
|
||||
if (discardDiscordant && existing.getBase() != p.getBase()) {
|
||||
filteredPileup.remove(readName);
|
||||
} else {
|
||||
if (existing.getQual() < p.getQual()) {
|
||||
filteredPileup.put(readName, p);
|
||||
if (baseQualNotMapQual) {
|
||||
if (existing.getQual() < p.getQual())
|
||||
filteredPileup.put(readName, p);
|
||||
}
|
||||
else {
|
||||
if (existing.getMappingQual() < p.getMappingQual())
|
||||
filteredPileup.put(readName, p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -998,6 +1016,44 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
return quals2String(getQuals());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new ReadBackedPileup that is sorted by start coordinate of the reads.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public RBP getStartSortedPileup() {
|
||||
|
||||
final TreeSet<PE> sortedElements = new TreeSet<PE>(new Comparator<PE>() {
|
||||
@Override
|
||||
public int compare(PE element1, PE element2) {
|
||||
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
|
||||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
||||
}
|
||||
});
|
||||
|
||||
if (pileupElementTracker instanceof PerSamplePileupElementTracker) {
|
||||
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>) pileupElementTracker;
|
||||
|
||||
for (final String sample : tracker.getSamples()) {
|
||||
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
|
||||
for (PE pile : perSampleElements)
|
||||
sortedElements.add(pile);
|
||||
}
|
||||
}
|
||||
else {
|
||||
UnifiedPileupElementTracker<PE> tracker = (UnifiedPileupElementTracker<PE>) pileupElementTracker;
|
||||
for (PE pile : tracker)
|
||||
sortedElements.add(pile);
|
||||
}
|
||||
|
||||
UnifiedPileupElementTracker<PE> sortedTracker = new UnifiedPileupElementTracker<PE>();
|
||||
for (PE pile : sortedElements)
|
||||
sortedTracker.add(pile);
|
||||
|
||||
return (RBP) createNewPileup(loc, sortedTracker);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FragmentCollection<PileupElement> toFragments() {
|
||||
return FragmentUtils.create(this);
|
||||
|
|
|
|||
|
|
@ -60,6 +60,16 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
|
|||
*/
|
||||
public ReadBackedPileup getOverlappingFragmentFilteredPileup();
|
||||
|
||||
/**
|
||||
* Returns a new ReadBackedPileup where only one read from an overlapping read
|
||||
* pair is retained. If discardDiscordant and the two reads in question disagree to their basecall,
|
||||
* neither read is retained. Otherwise, the read with the higher
|
||||
* quality (base or mapping, depending on baseQualNotMapQual) observation is retained
|
||||
*
|
||||
* @return the newly filtered pileup
|
||||
*/
|
||||
public ReadBackedPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual);
|
||||
|
||||
/**
|
||||
* Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this
|
||||
* does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy
|
||||
|
|
@ -261,6 +271,13 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
|
|||
*/
|
||||
public byte[] getMappingQuals();
|
||||
|
||||
/**
|
||||
* Returns a new ReadBackedPileup that is sorted by start coordinate of the reads.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public ReadBackedPileup getStartSortedPileup();
|
||||
|
||||
/**
|
||||
* Converts this pileup into a FragmentCollection (see FragmentUtils for documentation)
|
||||
* @return
|
||||
|
|
|
|||
|
|
@ -199,7 +199,7 @@ public class RecalDatum {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("%.2f,%,2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
|
||||
return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
|
||||
}
|
||||
|
||||
public String stringForCSV() {
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ import java.util.EnumSet;
|
|||
|
||||
public class CycleCovariate implements StandardCovariate {
|
||||
|
||||
private static final int MAXIMUM_CYCLE_VALUE = 1000;
|
||||
private int MAXIMUM_CYCLE_VALUE;
|
||||
private static final int CUSHION_FOR_INDELS = 4;
|
||||
private String default_platform = null;
|
||||
|
||||
|
|
@ -59,6 +59,8 @@ public class CycleCovariate implements StandardCovariate {
|
|||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE;
|
||||
|
||||
if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM))
|
||||
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform.");
|
||||
|
||||
|
|
@ -88,6 +90,9 @@ public class CycleCovariate implements StandardCovariate {
|
|||
|
||||
final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1;
|
||||
for (int i = 0; i < readLength; i++) {
|
||||
if ( cycle > MAXIMUM_CYCLE_VALUE )
|
||||
throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle was detected in read " + read.getReadName() + ". Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)");
|
||||
|
||||
final int substitutionKey = keyFromCycle(cycle);
|
||||
final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey;
|
||||
values.addCovariate(substitutionKey, indelKey, indelKey, i);
|
||||
|
|
|
|||
|
|
@ -646,7 +646,7 @@ public class AlignmentUtils {
|
|||
|
||||
// get the indel element and move it left one base
|
||||
CigarElement ce = cigar.getCigarElement(indexOfIndel - 1);
|
||||
elements.add(new CigarElement(ce.getLength() - 1, ce.getOperator()));
|
||||
elements.add(new CigarElement(Math.max(ce.getLength() - 1, 0), ce.getOperator()));
|
||||
elements.add(cigar.getCigarElement(indexOfIndel));
|
||||
if (indexOfIndel + 1 < cigar.numCigarElements()) {
|
||||
ce = cigar.getCigarElement(indexOfIndel + 1);
|
||||
|
|
|
|||
|
|
@ -184,9 +184,6 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
protected CommonInfo commonInfo = null;
|
||||
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
|
||||
|
||||
@Deprecated // ID is no longer stored in the attributes map
|
||||
private final static String ID_KEY = "ID";
|
||||
|
||||
public final static Set<String> PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet<String>());
|
||||
|
||||
/** The location of this VariantContext */
|
||||
|
|
@ -287,10 +284,6 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
|
||||
this.commonInfo = new CommonInfo(source, log10PError, filters, attributes);
|
||||
|
||||
// todo -- remove me when this check is no longer necessary
|
||||
if ( this.commonInfo.hasAttribute(ID_KEY) )
|
||||
throw new IllegalArgumentException("Trying to create a VariantContext with a ID key. Please use provided constructor argument ID");
|
||||
|
||||
if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); }
|
||||
|
||||
// we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles
|
||||
|
|
|
|||
|
|
@ -979,6 +979,40 @@ public class VariantContextUtils {
|
|||
private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||
|
||||
/**
|
||||
* Split variant context into its biallelic components if there are more than 2 alleles
|
||||
*
|
||||
* For VC has A/B/C alleles, returns A/B and A/C contexts.
|
||||
* Genotypes are all no-calls now (it's not possible to fix them easily)
|
||||
* Alleles are right trimmed to satisfy VCF conventions
|
||||
*
|
||||
* If vc is biallelic or non-variant it is just returned
|
||||
*
|
||||
* Chromosome counts are updated (but they are by definition 0)
|
||||
*
|
||||
* @param vc a potentially multi-allelic variant context
|
||||
* @return a list of bi-allelic (or monomorphic) variant context
|
||||
*/
|
||||
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc) {
|
||||
if ( ! vc.isVariant() || vc.isBiallelic() )
|
||||
// non variant or biallelics already satisfy the contract
|
||||
return Collections.singletonList(vc);
|
||||
else {
|
||||
final List<VariantContext> biallelics = new LinkedList<VariantContext>();
|
||||
|
||||
for ( final Allele alt : vc.getAlternateAlleles() ) {
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
final List<Allele> alleles = Arrays.asList(vc.getReference(), alt);
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
|
||||
calculateChromosomeCounts(builder, true);
|
||||
biallelics.add(reverseTrimAlleles(builder.make()));
|
||||
}
|
||||
|
||||
return biallelics;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately)
|
||||
*
|
||||
|
|
|
|||
|
|
@ -76,7 +76,6 @@ public class BCF2FieldWriterManager {
|
|||
if ( map.containsKey(field) )
|
||||
throw new ReviewedStingException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders");
|
||||
map.put(field, writer);
|
||||
if ( logger.isDebugEnabled() ) logger.debug(writer);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -0,0 +1,37 @@
|
|||
package org.broadinstitute.sting;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.testng.IAnnotationTransformer;
|
||||
import org.testng.annotations.ITestAnnotation;
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
/**
|
||||
* Provide default @Test values for GATK testng tests.
|
||||
*
|
||||
* Currently only sets the maximum runtime to 10 minutes, if it's not been specified.
|
||||
*
|
||||
* See http://beust.com/weblog/2006/10/18/annotation-transformers-in-java/
|
||||
*
|
||||
* @author depristo
|
||||
* @since 10/31/12
|
||||
* @version 0.1
|
||||
*/
|
||||
public class TestNGTestTransformer implements IAnnotationTransformer {
|
||||
public static final long DEFAULT_TIMEOUT = 1000 * 60 * 10; // 10 minutes max per test
|
||||
|
||||
final static Logger logger = Logger.getLogger(TestNGTestTransformer.class);
|
||||
|
||||
public void transform(ITestAnnotation annotation,
|
||||
Class testClass,
|
||||
Constructor testConstructor,
|
||||
Method testMethod)
|
||||
{
|
||||
if ( annotation.getTimeOut() == 0 ) {
|
||||
logger.warn("test " + testMethod.toString() + " has no specified timeout, adding default timeout " + DEFAULT_TIMEOUT / 1000 / 60 + " minutes");
|
||||
annotation.setTimeOut(DEFAULT_TIMEOUT);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,358 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.executive.WindowMaker;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: thibault
|
||||
* Date: 11/13/12
|
||||
* Time: 2:47 PM
|
||||
*
|
||||
* Test the Active Region Traversal Contract
|
||||
* http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract
|
||||
*/
|
||||
public class TraverseActiveRegionsTest extends BaseTest {
|
||||
|
||||
private class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
|
||||
private final double prob;
|
||||
protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
|
||||
protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new HashMap<GenomeLoc, ActiveRegion>();
|
||||
|
||||
public DummyActiveRegionWalker() {
|
||||
this.prob = 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
isActiveCalls.add(ref.getLocus());
|
||||
return new ActivityProfileResult(ref.getLocus(), prob);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
|
||||
mappedActiveRegions.put(activeRegion.getLocation(), activeRegion);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private final TraverseActiveRegions<Integer, Integer> t = new TraverseActiveRegions<Integer, Integer>();
|
||||
|
||||
private IndexedFastaSequenceFile reference;
|
||||
private SAMSequenceDictionary dictionary;
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
private List<GenomeLoc> intervals;
|
||||
private List<SAMRecord> reads;
|
||||
|
||||
@BeforeClass
|
||||
private void init() throws FileNotFoundException {
|
||||
reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference));
|
||||
dictionary = reference.getSequenceDictionary();
|
||||
genomeLocParser = new GenomeLocParser(dictionary);
|
||||
|
||||
intervals = new ArrayList<GenomeLoc>();
|
||||
intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20));
|
||||
intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999));
|
||||
intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999));
|
||||
intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999));
|
||||
intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000));
|
||||
// TODO: this fails!
|
||||
//intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 20000));
|
||||
intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList();
|
||||
|
||||
reads = new ArrayList<SAMRecord>();
|
||||
reads.add(buildSAMRecord("simple", "1", 100, 200));
|
||||
reads.add(buildSAMRecord("overlap_equal", "1", 10, 20));
|
||||
reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21));
|
||||
reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009));
|
||||
reads.add(buildSAMRecord("boundary_unequal", "1", 1995, 2050));
|
||||
reads.add(buildSAMRecord("extended_only", "1", 3000, 3100));
|
||||
reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990));
|
||||
reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000));
|
||||
// TODO
|
||||
//reads.add(buildSAMRecord("simple20", "20", 10100, 10150));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllBasesSeen() {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
|
||||
List<GenomeLoc> activeIntervals = getIsActiveIntervals(walker, intervals);
|
||||
// Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call
|
||||
verifyEqualIntervals(intervals, activeIntervals);
|
||||
|
||||
// TODO: more tests and edge cases
|
||||
}
|
||||
|
||||
private List<GenomeLoc> getIsActiveIntervals(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
|
||||
List<GenomeLoc> activeIntervals = new ArrayList<GenomeLoc>();
|
||||
for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) {
|
||||
t.traverse(walker, dataProvider, 0);
|
||||
activeIntervals.addAll(walker.isActiveCalls);
|
||||
}
|
||||
|
||||
return activeIntervals;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testActiveRegionCoverage() {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
|
||||
Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
|
||||
verifyActiveRegionCoverage(intervals, activeRegions);
|
||||
|
||||
// TODO: more tests and edge cases
|
||||
}
|
||||
|
||||
private void verifyActiveRegionCoverage(List<GenomeLoc> intervals, Collection<ActiveRegion> activeRegions) {
|
||||
List<GenomeLoc> intervalStarts = new ArrayList<GenomeLoc>();
|
||||
List<GenomeLoc> intervalStops = new ArrayList<GenomeLoc>();
|
||||
|
||||
for (GenomeLoc interval : intervals) {
|
||||
intervalStarts.add(interval.getStartLocation());
|
||||
intervalStops.add(interval.getStopLocation());
|
||||
}
|
||||
|
||||
Map<GenomeLoc, ActiveRegion> baseRegionMap = new HashMap<GenomeLoc, ActiveRegion>();
|
||||
|
||||
for (ActiveRegion activeRegion : activeRegions) {
|
||||
for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) {
|
||||
// Contract: Regions do not overlap
|
||||
Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region");
|
||||
baseRegionMap.put(activeLoc, activeRegion);
|
||||
}
|
||||
|
||||
GenomeLoc start = activeRegion.getLocation().getStartLocation();
|
||||
if (intervalStarts.contains(start))
|
||||
intervalStarts.remove(start);
|
||||
|
||||
GenomeLoc stop = activeRegion.getLocation().getStopLocation();
|
||||
if (intervalStops.contains(stop))
|
||||
intervalStops.remove(stop);
|
||||
}
|
||||
|
||||
for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) {
|
||||
// Contract: Each location in the interval(s) is in exactly one region
|
||||
// Contract: The total set of regions exactly matches the analysis interval(s)
|
||||
Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region");
|
||||
baseRegionMap.remove(baseLoc);
|
||||
}
|
||||
|
||||
// Contract: The total set of regions exactly matches the analysis interval(s)
|
||||
Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals");
|
||||
|
||||
// Contract: All explicit interval boundaries must also be region boundaries
|
||||
Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location");
|
||||
Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadMapping() {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
|
||||
// Contract: Each read has the Primary state in a single region (or none)
|
||||
// This is the region of maximum overlap for the read (earlier if tied)
|
||||
|
||||
// Contract: Each read has the Non-Primary state in all other regions it overlaps
|
||||
// Contract: Each read has the Extended state in regions where it only overlaps if the region is extended
|
||||
|
||||
// simple: Primary in 1:1-999
|
||||
// overlap_equal: Primary in 1:1-999
|
||||
// overlap_unequal: Primary in 1:1-999
|
||||
// boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
|
||||
// boundary_unequal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
|
||||
// extended_only: Extended in 1:2000-2999
|
||||
// extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
|
||||
// outside_intervals: none
|
||||
|
||||
// TODO
|
||||
// simple20: Primary in 20:10000-20000
|
||||
|
||||
Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
|
||||
ActiveRegion region;
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
|
||||
|
||||
verifyReadPrimary(region, "simple");
|
||||
verifyReadPrimary(region, "overlap_equal");
|
||||
verifyReadPrimary(region, "overlap_unequal");
|
||||
verifyReadNotPlaced(region, "boundary_equal");
|
||||
verifyReadNotPlaced(region, "boundary_unequal");
|
||||
verifyReadNotPlaced(region, "extended_only");
|
||||
// TODO: fail verifyReadNonPrimary(region, "extended_and_np");
|
||||
verifyReadNotPlaced(region, "outside_intervals");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
|
||||
|
||||
verifyReadNotPlaced(region, "simple");
|
||||
verifyReadNotPlaced(region, "overlap_equal");
|
||||
verifyReadNotPlaced(region, "overlap_unequal");
|
||||
// TODO: fail verifyReadPrimary(region, "boundary_equal");
|
||||
// TODO: fail verifyReadNonPrimary(region, "boundary_unequal");
|
||||
verifyReadNotPlaced(region, "extended_only");
|
||||
// TODO: fail verifyReadPrimary(region, "extended_and_np");
|
||||
verifyReadNotPlaced(region, "outside_intervals");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
|
||||
|
||||
verifyReadNotPlaced(region, "simple");
|
||||
verifyReadNotPlaced(region, "overlap_equal");
|
||||
verifyReadNotPlaced(region, "overlap_unequal");
|
||||
// TODO: fail verifyReadNonPrimary(region, "boundary_equal");
|
||||
verifyReadPrimary(region, "boundary_unequal");
|
||||
// TODO: fail verifyReadExtended(region, "extended_only");
|
||||
// TODO: fail verifyReadExtended(region, "extended_and_np");
|
||||
verifyReadNotPlaced(region, "outside_intervals");
|
||||
|
||||
// TODO: more tests and edge cases
|
||||
}
|
||||
|
||||
private void verifyReadPrimary(ActiveRegion region, String readName) {
|
||||
SAMRecord read = getRead(region, readName);
|
||||
Assert.assertFalse(read.getNotPrimaryAlignmentFlag(), "Read " + read + " not primary in active region " + region);
|
||||
}
|
||||
|
||||
private void verifyReadNonPrimary(ActiveRegion region, String readName) {
|
||||
SAMRecord read = getRead(region, readName);
|
||||
Assert.assertTrue(read.getNotPrimaryAlignmentFlag(), "Read " + read + " primary in active region " + region);
|
||||
}
|
||||
|
||||
private void verifyReadExtended(ActiveRegion region, String readName) {
|
||||
Assert.fail("The Extended read state has not been implemented");
|
||||
}
|
||||
|
||||
private void verifyReadNotPlaced(ActiveRegion region, String readName) {
|
||||
for (SAMRecord read : region.getReads()) {
|
||||
if (read.getReadName().equals(readName))
|
||||
Assert.fail("Read " + readName + " found in active region " + region);
|
||||
}
|
||||
}
|
||||
|
||||
private SAMRecord getRead(ActiveRegion region, String readName) {
|
||||
for (SAMRecord read : region.getReads()) {
|
||||
if (read.getReadName().equals(readName))
|
||||
return read;
|
||||
}
|
||||
|
||||
Assert.fail("Read " + readName + " not found in active region " + region);
|
||||
return null;
|
||||
}
|
||||
|
||||
private Map<GenomeLoc, ActiveRegion> getActiveRegions(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
|
||||
for (LocusShardDataProvider dataProvider : createDataProviders(intervals))
|
||||
t.traverse(walker, dataProvider, 0);
|
||||
|
||||
return walker.mappedActiveRegions;
|
||||
}
|
||||
|
||||
private Collection<GenomeLoc> toSingleBaseLocs(GenomeLoc interval) {
|
||||
List<GenomeLoc> bases = new ArrayList<GenomeLoc>();
|
||||
if (interval.size() == 1)
|
||||
bases.add(interval);
|
||||
else {
|
||||
for (int location = interval.getStart(); location <= interval.getStop(); location++)
|
||||
bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location));
|
||||
}
|
||||
|
||||
return bases;
|
||||
}
|
||||
|
||||
private Collection<GenomeLoc> toSingleBaseLocs(List<GenomeLoc> intervals) {
|
||||
Set<GenomeLoc> bases = new TreeSet<GenomeLoc>(); // for sorting and uniqueness
|
||||
for (GenomeLoc interval : intervals)
|
||||
bases.addAll(toSingleBaseLocs(interval));
|
||||
|
||||
return bases;
|
||||
}
|
||||
|
||||
private void verifyEqualIntervals(List<GenomeLoc> aIntervals, List<GenomeLoc> bIntervals) {
|
||||
Collection<GenomeLoc> aBases = toSingleBaseLocs(aIntervals);
|
||||
Collection<GenomeLoc> bBases = toSingleBaseLocs(bIntervals);
|
||||
|
||||
Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size());
|
||||
|
||||
Iterator<GenomeLoc> aIter = aBases.iterator();
|
||||
Iterator<GenomeLoc> bIter = bBases.iterator();
|
||||
while (aIter.hasNext() && bIter.hasNext()) {
|
||||
GenomeLoc aLoc = aIter.next();
|
||||
GenomeLoc bLoc = bIter.next();
|
||||
Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc);
|
||||
}
|
||||
}
|
||||
|
||||
// copied from LocusViewTemplate
|
||||
protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) {
|
||||
SAMFileHeader header = new SAMFileHeader();
|
||||
header.setSequenceDictionary(dictionary);
|
||||
GATKSAMRecord record = new GATKSAMRecord(header);
|
||||
|
||||
record.setReadName(readName);
|
||||
record.setReferenceIndex(dictionary.getSequenceIndex(contig));
|
||||
record.setAlignmentStart(alignmentStart);
|
||||
|
||||
Cigar cigar = new Cigar();
|
||||
int len = alignmentEnd - alignmentStart + 1;
|
||||
cigar.add(new CigarElement(len, CigarOperator.M));
|
||||
record.setCigar(cigar);
|
||||
record.setReadBases(new byte[len]);
|
||||
record.setBaseQualities(new byte[len]);
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
private List<LocusShardDataProvider> createDataProviders(List<GenomeLoc> intervals) {
|
||||
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
engine.setGenomeLocParser(genomeLocParser);
|
||||
t.initialize(engine);
|
||||
|
||||
StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(reads);
|
||||
Shard shard = new MockLocusShard(genomeLocParser, intervals);
|
||||
|
||||
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
|
||||
for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, iterator, shard.getGenomeLocs())) {
|
||||
providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
|
||||
}
|
||||
|
||||
return providers;
|
||||
}
|
||||
}
|
||||
|
|
@ -151,7 +151,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testTabixAnnotations() {
|
||||
public void testTabixAnnotationsAndParallelism() {
|
||||
final String MD5 = "99938d1e197b8f10c408cac490a00a62";
|
||||
for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
@ -159,6 +159,12 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
Arrays.asList(MD5));
|
||||
executeTest("Testing lookup vcf tabix vs. vcf tribble", spec);
|
||||
}
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1,
|
||||
Arrays.asList(MD5));
|
||||
|
||||
executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
|
|||
public void testCallableLociWalkerBed() {
|
||||
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
|
||||
Arrays.asList("9e4ec9c23f21a8162d27a39ab057398c", SUMMARY_MD5));
|
||||
Arrays.asList("42e86c06c167246a28bffdacaca75ffb", SUMMARY_MD5));
|
||||
executeTest("formatBed", spec);
|
||||
}
|
||||
|
||||
|
|
@ -46,7 +46,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
|
|||
public void testCallableLociWalkerPerBase() {
|
||||
String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
|
||||
Arrays.asList("e6044b4495ef24f542403e6a94437068", SUMMARY_MD5));
|
||||
Arrays.asList("d66c525d9c70f62df8156261d3e535ad", SUMMARY_MD5));
|
||||
executeTest("format_state_per_base", spec);
|
||||
}
|
||||
|
||||
|
|
@ -54,7 +54,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
|
|||
public void testCallableLociWalker2() {
|
||||
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100 -L 1:10,000,110-10,000,120 -summary %s";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
|
||||
Arrays.asList("c671f65712d9575b8b3e1f1dbedc146e", "d287510eac04acf5a56f5cde2cba0e4a"));
|
||||
Arrays.asList("330f476085533db92a9dbdb3a127c041", "d287510eac04acf5a56f5cde2cba0e4a"));
|
||||
executeTest("formatBed by interval", spec);
|
||||
}
|
||||
|
||||
|
|
@ -62,7 +62,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
|
|||
public void testCallableLociWalker3() {
|
||||
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
|
||||
Arrays.asList("b7d26a470ef906590249f2fa45fd6bdd", "da431d393f7c2b2b3e27556b86c1dbc7"));
|
||||
Arrays.asList("46a53379aaaf9803276a0a34b234f6ab", "da431d393f7c2b2b3e27556b86c1dbc7"));
|
||||
executeTest("formatBed lots of arguments", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import java.util.ArrayList;
|
|||
|
||||
public class UnifiedGenotyperLargeScaleTest extends WalkerTest {
|
||||
|
||||
@Test
|
||||
@Test( timeOut = 18000000 )
|
||||
public void testUnifiedGenotyperWholeGenome() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-R " + hg18Reference +
|
||||
|
|
@ -22,7 +22,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest {
|
|||
executeTest("testUnifiedGenotyperWholeGenome", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test( timeOut = 18000000 )
|
||||
public void testUnifiedGenotyperWholeExome() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-R " + hg18Reference +
|
||||
|
|
@ -37,7 +37,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest {
|
|||
executeTest("testUnifiedGenotyperWholeExome", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test( timeOut = 18000000 )
|
||||
public void testUnifiedGenotyperWGParallel() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-R " + hg18Reference +
|
||||
|
|
|
|||
|
|
@ -0,0 +1,35 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
|
||||
import org.testng.SkipException;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
// ********************************************************************************** //
|
||||
// Note that this class also serves as an integration test for the VariantAnnotator! //
|
||||
// ********************************************************************************** //
|
||||
|
||||
public class UnifiedGenotyperLiteIntegrationTest extends WalkerTest {
|
||||
|
||||
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing contamination down-sampling gets ignored
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void testContaminationDownsampling() {
|
||||
if ( !GATKLiteUtils.isGATKLite() )
|
||||
throw new SkipException("Only want to test for GATK lite");
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("9addd225a985178339a0c49dc5fdc220"));
|
||||
executeTest("test contamination_percentage_to_filter gets ignored", spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -6,7 +6,7 @@ import org.testng.annotations.Test;
|
|||
import java.util.ArrayList;
|
||||
|
||||
public class IndelRealignerLargeScaleTest extends WalkerTest {
|
||||
@Test
|
||||
@Test( timeOut = 18000000 )
|
||||
public void testHighCoverage() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
||||
|
|
@ -21,7 +21,7 @@ public class IndelRealignerLargeScaleTest extends WalkerTest {
|
|||
executeTest("testIndelRealignerHighCoverage", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test( timeOut = 18000000 )
|
||||
public void testRealigner() {
|
||||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import org.testng.annotations.Test;
|
|||
import java.util.ArrayList;
|
||||
|
||||
public class RealignerTargetCreatorLargeScaleTest extends WalkerTest {
|
||||
@Test
|
||||
@Test( timeOut = 18000000 )
|
||||
public void testRealignerTargetCreator() {
|
||||
|
||||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
|
||||
import static org.testng.Assert.assertEquals;
|
||||
import static org.testng.Assert.assertFalse;
|
||||
import static org.testng.Assert.assertTrue;
|
||||
|
||||
import org.testng.annotations.BeforeClass;
|
||||
|
|
@ -117,6 +118,41 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
|
|||
assertTrue(loc.getContigIndex() == 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void overlap() {
|
||||
for ( int i = 1; i < 6; i++ ) {
|
||||
final int start = i * 10;
|
||||
mSortedSet.add(genomeLocParser.createGenomeLoc(contigOneName, start, start + 1));
|
||||
}
|
||||
|
||||
// test matches in and around interval
|
||||
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 9, 9)));
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 10, 10)));
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 11)));
|
||||
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 12)));
|
||||
|
||||
// test matches spanning intervals
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 14, 20)));
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 15)));
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 30, 40)));
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53)));
|
||||
|
||||
// test miss
|
||||
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 19)));
|
||||
|
||||
// test exact match after miss
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 40, 41)));
|
||||
|
||||
// test matches at beginning of intervals
|
||||
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 5, 6)));
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 0, 10)));
|
||||
|
||||
// test matches at end of intervals
|
||||
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53)));
|
||||
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53)));
|
||||
assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mergingOverlappingAbove() {
|
||||
GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50);
|
||||
|
|
|
|||
|
|
@ -159,7 +159,7 @@ public class HaplotypeUnitTest extends BaseTest {
|
|||
final VariantContext vc = new VariantContextBuilder().alleles(alleles).loc("1", loc, loc + h1refAllele.getBases().length - 1).make();
|
||||
h.setAlignmentStartHapwrtRef(0);
|
||||
h.setCigar(cigar);
|
||||
final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc);
|
||||
final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc, vc.getStart());
|
||||
final Haplotype h1expected = new Haplotype(newHap.getBytes());
|
||||
Assert.assertEquals(h1, h1expected);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -123,7 +123,7 @@ public class ActivityProfileUnitTest extends BaseTest {
|
|||
for ( int i = 0; i < cfg.probs.size(); i++ ) {
|
||||
double p = cfg.probs.get(i);
|
||||
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
|
||||
profile.add(loc, new ActivityProfileResult(p));
|
||||
profile.add(new ActivityProfileResult(loc, p));
|
||||
}
|
||||
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import java.util.concurrent.Executors;
|
|||
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
||||
private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta");
|
||||
private static final int STEP_SIZE = 1;
|
||||
private final static boolean DEBUG = false;
|
||||
|
||||
//private static final List<Integer> QUERY_SIZES = Arrays.asList(1);
|
||||
private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100);
|
||||
|
|
@ -53,9 +54,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested;
|
||||
}
|
||||
|
||||
@Test(dataProvider = "fastas", enabled = true)
|
||||
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
|
||||
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize));
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
|
||||
|
||||
SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
|
||||
logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
|
||||
|
|
@ -64,6 +65,8 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
|
||||
Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");
|
||||
|
||||
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
|
||||
|
||||
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
|
||||
|
|
@ -92,10 +95,10 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
// Tests grabbing sequences around a middle cached value.
|
||||
@Test(dataProvider = "fastas", enabled = true)
|
||||
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
|
||||
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
|
||||
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize));
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
|
||||
|
||||
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
|
||||
|
||||
|
|
@ -123,11 +126,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
@DataProvider(name = "ParallelFastaTest")
|
||||
public Object[][] createParallelFastaTest() {
|
||||
List<Object[]> params = new ArrayList<Object[]>();
|
||||
// for ( int nt : Arrays.asList(1, 2, 3) ) {
|
||||
// for ( int cacheSize : CACHE_SIZES ) {
|
||||
// params.add(new Object[]{simpleFasta, cacheSize, 10, nt});
|
||||
// }
|
||||
// }
|
||||
|
||||
for ( File fasta : Arrays.asList(simpleFasta) ) {
|
||||
for ( int cacheSize : CACHE_SIZES ) {
|
||||
|
|
@ -143,9 +141,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "ParallelFastaTest", enabled = true, timeOut = 60000)
|
||||
@Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000)
|
||||
public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException {
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize));
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
|
||||
|
||||
logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt));
|
||||
for ( int iterations = 0; iterations < 1; iterations++ ) {
|
||||
|
|
@ -163,4 +161,49 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
executor.shutdownNow();
|
||||
}
|
||||
}
|
||||
|
||||
// make sure some bases are lower case and some are upper case
|
||||
@Test(enabled = true)
|
||||
public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException {
|
||||
final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
|
||||
final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true);
|
||||
final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA));
|
||||
|
||||
int nMixedCase = 0;
|
||||
for ( SAMSequenceRecord contig : original.getSequenceDictionary().getSequences() ) {
|
||||
nMixedCase += testCases(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1);
|
||||
|
||||
final int step = 100;
|
||||
for ( int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step ) {
|
||||
testCases(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos);
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. Unexpected test state");
|
||||
}
|
||||
|
||||
private int testCases(final IndexedFastaSequenceFile original,
|
||||
final IndexedFastaSequenceFile casePreserving,
|
||||
final IndexedFastaSequenceFile allUpper,
|
||||
final String contig, final int start, final int stop ) {
|
||||
final String orig = fetchBaseString(original, contig, start, stop);
|
||||
final String keptCase = fetchBaseString(casePreserving, contig, start, stop);
|
||||
final String upperCase = fetchBaseString(allUpper, contig, start, stop).toUpperCase();
|
||||
|
||||
final String origToUpper = orig.toUpperCase();
|
||||
if ( ! orig.equals(origToUpper) ) {
|
||||
Assert.assertEquals(keptCase, orig, "Case preserving operation not equal to the original case for contig " + contig);
|
||||
Assert.assertEquals(upperCase, origToUpper, "All upper case reader not equal to the uppercase of original case for contig " + contig);
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) {
|
||||
if ( start == -1 )
|
||||
return new String(reader.getSequence(contig).getBases());
|
||||
else
|
||||
return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -24,7 +25,7 @@ public class CycleCovariateUnitTest {
|
|||
covariate.initialize(RAC);
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testSimpleCycles() {
|
||||
short readLength = 10;
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
|
||||
|
|
@ -53,9 +54,31 @@ public class CycleCovariateUnitTest {
|
|||
for (short i = 0; i < values.length; i++) {
|
||||
short actual = Short.decode(covariate.formatKey(values[i][0]));
|
||||
int expected = init + (increment * i);
|
||||
// System.out.println(String.format("%d: %d, %d", i, actual, expected));
|
||||
Assert.assertEquals(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions={UserException.class})
|
||||
public void testMoreThanMaxCycleFails() {
|
||||
int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1;
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
|
||||
read.setReadPairedFlag(true);
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
|
||||
read.getReadGroup().setPlatform("illumina");
|
||||
|
||||
ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1);
|
||||
covariate.recordValues(read, readCovariates);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMaxCyclePasses() {
|
||||
int readLength = RAC.MAXIMUM_CYCLE_VALUE;
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
|
||||
read.setReadPairedFlag(true);
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
|
||||
read.getReadGroup().setPlatform("illumina");
|
||||
|
||||
ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1);
|
||||
covariate.recordValues(read, readCovariates);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -782,7 +782,7 @@ public class VariantContextTestProvider {
|
|||
Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
|
||||
Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
|
||||
Assert.assertEquals(actual.getID(), expected.getID(), "id");
|
||||
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles");
|
||||
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);
|
||||
|
||||
assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
|
||||
Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext;
|
|||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -39,7 +39,7 @@ import java.io.FileNotFoundException;
|
|||
import java.util.*;
|
||||
|
||||
public class VariantContextUtilsUnitTest extends BaseTest {
|
||||
Allele Aref, T, C, Cref, ATC, ATCATC;
|
||||
Allele Aref, T, C, G, Cref, ATC, ATCATC;
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
@BeforeSuite
|
||||
|
|
@ -58,6 +58,7 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
Cref = Allele.create("C", true);
|
||||
T = Allele.create("T");
|
||||
C = Allele.create("C");
|
||||
G = Allele.create("G");
|
||||
ATC = Allele.create("ATC");
|
||||
ATCATC = Allele.create("ATCATC");
|
||||
}
|
||||
|
|
@ -697,10 +698,120 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "ReverseClippingPositionTestProvider")
|
||||
public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) {
|
||||
int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false);
|
||||
Assert.assertEquals(result, cfg.expectedClip);
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// test splitting into bi-allelics
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "SplitBiallelics")
|
||||
public Object[][] makeSplitBiallelics() throws CloneNotSupportedException {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C));
|
||||
|
||||
// biallelic -> biallelic
|
||||
tests.add(new Object[]{root.make(), Arrays.asList(root.make())});
|
||||
|
||||
// monos -> monos
|
||||
root.alleles(Arrays.asList(Aref));
|
||||
tests.add(new Object[]{root.make(), Arrays.asList(root.make())});
|
||||
|
||||
root.alleles(Arrays.asList(Aref, C, T));
|
||||
tests.add(new Object[]{root.make(),
|
||||
Arrays.asList(
|
||||
root.alleles(Arrays.asList(Aref, C)).make(),
|
||||
root.alleles(Arrays.asList(Aref, T)).make())});
|
||||
|
||||
root.alleles(Arrays.asList(Aref, C, T, G));
|
||||
tests.add(new Object[]{root.make(),
|
||||
Arrays.asList(
|
||||
root.alleles(Arrays.asList(Aref, C)).make(),
|
||||
root.alleles(Arrays.asList(Aref, T)).make(),
|
||||
root.alleles(Arrays.asList(Aref, G)).make())});
|
||||
|
||||
final Allele C = Allele.create("C");
|
||||
final Allele CA = Allele.create("CA");
|
||||
final Allele CAA = Allele.create("CAA");
|
||||
final Allele CAAAA = Allele.create("CAAAA");
|
||||
final Allele CAAAAA = Allele.create("CAAAAA");
|
||||
final Allele Cref = Allele.create("C", true);
|
||||
final Allele CAref = Allele.create("CA", true);
|
||||
final Allele CAAref = Allele.create("CAA", true);
|
||||
final Allele CAAAref = Allele.create("CAAA", true);
|
||||
|
||||
root.alleles(Arrays.asList(Cref, CA, CAA));
|
||||
tests.add(new Object[]{root.make(),
|
||||
Arrays.asList(
|
||||
root.alleles(Arrays.asList(Cref, CA)).make(),
|
||||
root.alleles(Arrays.asList(Cref, CAA)).make())});
|
||||
|
||||
root.alleles(Arrays.asList(CAAref, C, CA)).stop(12);
|
||||
tests.add(new Object[]{root.make(),
|
||||
Arrays.asList(
|
||||
root.alleles(Arrays.asList(CAAref, C)).make(),
|
||||
root.alleles(Arrays.asList(CAref, C)).stop(11).make())});
|
||||
|
||||
root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13);
|
||||
tests.add(new Object[]{root.make(),
|
||||
Arrays.asList(
|
||||
root.alleles(Arrays.asList(CAAAref, C)).make(),
|
||||
root.alleles(Arrays.asList(CAAref, C)).stop(12).make(),
|
||||
root.alleles(Arrays.asList(CAref, C)).stop(11).make())});
|
||||
|
||||
root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13);
|
||||
tests.add(new Object[]{root.make(),
|
||||
Arrays.asList(
|
||||
root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(),
|
||||
root.alleles(Arrays.asList(Cref, CA)).stop(10).make(),
|
||||
root.alleles(Arrays.asList(CAref, C)).stop(11).make(),
|
||||
root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SplitBiallelics")
|
||||
public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
|
||||
final List<VariantContext> biallelics = VariantContextUtils.splitVariantContextToBiallelics(vc);
|
||||
Assert.assertEquals(biallelics.size(), expectedBiallelics.size());
|
||||
for ( int i = 0; i < biallelics.size(); i++ ) {
|
||||
final VariantContext actual = biallelics.get(i);
|
||||
final VariantContext expected = expectedBiallelics.get(i);
|
||||
VariantContextTestProvider.assertEquals(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes")
|
||||
public void testSplitBiallelicsGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
|
||||
int sampleI = 0;
|
||||
for ( final List<Allele> alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) {
|
||||
genotypes.add(GenotypeBuilder.create("sample" + sampleI, alleles));
|
||||
}
|
||||
genotypes.add(GenotypeBuilder.createMissing("missing", 2));
|
||||
|
||||
final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make();
|
||||
|
||||
final List<VariantContext> biallelics = VariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes);
|
||||
for ( int i = 0; i < biallelics.size(); i++ ) {
|
||||
final VariantContext actual = biallelics.get(i);
|
||||
Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples
|
||||
|
||||
for ( final Genotype inputGenotype : genotypes ) {
|
||||
final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName());
|
||||
Assert.assertNotNull(actualGenotype);
|
||||
if ( ! vc.isVariant() || vc.isBiallelic() )
|
||||
Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName()));
|
||||
else
|
||||
Assert.assertTrue(actualGenotype.isNoCall());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,507 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.CNV
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.util.VCF_BAM_utilities
|
||||
import org.broadinstitute.sting.queue.util.DoC._
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import java.io.{PrintStream, PrintWriter}
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils
|
||||
|
||||
class xhmmCNVpipeline extends QScript {
|
||||
qscript =>
|
||||
|
||||
@Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true)
|
||||
var bams: File = _
|
||||
|
||||
@Input(doc = "gatk jar file", shortName = "J", required = true)
|
||||
var gatkJarFile: File = _
|
||||
|
||||
@Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true)
|
||||
var xhmmExec: File = _
|
||||
|
||||
@Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true)
|
||||
var pseqExec: File = _
|
||||
|
||||
@Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true)
|
||||
var pseqSeqDB: String = _
|
||||
|
||||
@Input(shortName = "R", doc = "ref", required = true)
|
||||
var referenceFile: File = _
|
||||
|
||||
@Input(shortName = "L", doc = "Intervals", required = false)
|
||||
var intervals: File = _
|
||||
|
||||
@Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false)
|
||||
var scatterCountInput = 0
|
||||
|
||||
@Argument(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false)
|
||||
var samplesPerJob = 1
|
||||
|
||||
@Output(doc = "Base name for files to output", shortName = "o", required = true)
|
||||
var outputBase: File = _
|
||||
|
||||
@Hidden
|
||||
@Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false)
|
||||
var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS
|
||||
|
||||
@Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false)
|
||||
var MAX_DEPTH = 20000
|
||||
|
||||
@Hidden
|
||||
@Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false)
|
||||
var NUM_BINS = 200
|
||||
|
||||
@Hidden
|
||||
@Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false)
|
||||
var START_BIN = 1
|
||||
|
||||
@Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false)
|
||||
var minMappingQuality = 0
|
||||
|
||||
@Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false)
|
||||
var minBaseQuality = 0
|
||||
|
||||
@Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false)
|
||||
var wholeMatrixMemory = -1
|
||||
|
||||
@Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false)
|
||||
var minTargGC : Double = 0.1
|
||||
|
||||
@Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false)
|
||||
var maxTargGC : Double = 0.9
|
||||
|
||||
@Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false)
|
||||
var minTargRepeats : Double = 0.0
|
||||
|
||||
@Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false)
|
||||
var maxTargRepeats : Double = 0.1
|
||||
|
||||
@Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false)
|
||||
var sampleIDsMap: String = ""
|
||||
|
||||
@Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false)
|
||||
var sampleIDsMapFromColumn = 1
|
||||
|
||||
@Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false)
|
||||
var sampleIDsMapToColumn = 2
|
||||
|
||||
@Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false)
|
||||
var targetSampleFiltersString: String = ""
|
||||
|
||||
@Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false)
|
||||
var PCAnormalizeMethodString: String = ""
|
||||
|
||||
@Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false)
|
||||
var targetSampleNormalizedFiltersString: String = ""
|
||||
|
||||
@Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true)
|
||||
var xhmmParamsArg: File = _
|
||||
|
||||
@Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false)
|
||||
var discoverCommandLineParams: String = ""
|
||||
|
||||
@Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false)
|
||||
var genotypeCommandLineParams: String = ""
|
||||
|
||||
@Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false)
|
||||
var genotypeSubsegments: Boolean = false
|
||||
|
||||
@Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false)
|
||||
var maxTargetsInSubsegment = 30
|
||||
|
||||
@Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false)
|
||||
var subsegmentGenotypeThreshold = 20.0
|
||||
|
||||
@Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false)
|
||||
var longJobQueue: String = ""
|
||||
|
||||
|
||||
val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list"
|
||||
|
||||
val RD_OUTPUT_SUFFIX: String = ".RD.txt"
|
||||
|
||||
val TARGS_GC_SUFFIX = ".locus_GC.txt"
|
||||
val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt"
|
||||
|
||||
val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt"
|
||||
val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt"
|
||||
|
||||
val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt"
|
||||
val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt"
|
||||
|
||||
|
||||
trait WholeMatrixMemoryLimit extends CommandLineFunction {
|
||||
// Since loading ALL of the data can take significant memory:
|
||||
if (wholeMatrixMemory < 0) {
|
||||
this.memoryLimit = 24
|
||||
}
|
||||
else {
|
||||
this.memoryLimit = wholeMatrixMemory
|
||||
}
|
||||
}
|
||||
|
||||
trait LongRunTime extends CommandLineFunction {
|
||||
if (longJobQueue != "")
|
||||
this.jobQueue = longJobQueue
|
||||
}
|
||||
|
||||
def script = {
|
||||
val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile)
|
||||
add(prepTargets)
|
||||
|
||||
trait CommandLineGATKArgs extends CommandLineGATK {
|
||||
this.intervals :+= prepTargets.out
|
||||
this.jarFile = qscript.gatkJarFile
|
||||
this.reference_sequence = qscript.referenceFile
|
||||
this.logging_level = "INFO"
|
||||
}
|
||||
|
||||
val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams))
|
||||
val samples: List[String] = sampleToBams.keys.toList
|
||||
Console.out.printf("Samples are %s%n", samples)
|
||||
|
||||
val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase)
|
||||
var docs: List[DoC] = List[DoC]()
|
||||
for (group <- groups) {
|
||||
Console.out.printf("Group is %s%n", group)
|
||||
docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs
|
||||
}
|
||||
addAll(docs)
|
||||
|
||||
val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit
|
||||
add(mergeDepths)
|
||||
|
||||
var excludeTargets : List[File] = List[File]()
|
||||
if (minTargGC > 0 || maxTargGC < 1) {
|
||||
val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs
|
||||
calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX
|
||||
add(calcGCcontents)
|
||||
|
||||
val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC)
|
||||
add(excludeTargetsBasedOnGC)
|
||||
excludeTargets ::= excludeTargetsBasedOnGC.out
|
||||
}
|
||||
|
||||
class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction {
|
||||
@Input(doc="")
|
||||
var intervals: File = prepTargets.out
|
||||
|
||||
@Output(doc="")
|
||||
var out : File = new File(outFile)
|
||||
|
||||
val regFile : String = outputBase.getPath + ".targets.reg"
|
||||
val locDB : String = outputBase.getPath + ".targets.LOCDB"
|
||||
|
||||
val removeFiles = "rm -f " + regFile + " " + locDB
|
||||
val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile
|
||||
val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load"
|
||||
val calcRepeatMaskedPercent = pseqExec + " . loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats"
|
||||
val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out
|
||||
|
||||
var command: String =
|
||||
removeFiles +
|
||||
" && " + createRegFile +
|
||||
" && " + createLOCDB +
|
||||
" && " + calcRepeatMaskedPercent +
|
||||
" && " + extractRepeatMaskedPercent
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command
|
||||
}
|
||||
|
||||
if (minTargRepeats > 0 || maxTargRepeats < 1) {
|
||||
val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX)
|
||||
add(calcRepeatComplexity)
|
||||
|
||||
val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats)
|
||||
add(excludeTargetsBasedOnRepeats)
|
||||
excludeTargets ::= excludeTargetsBasedOnRepeats.out
|
||||
}
|
||||
|
||||
val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets)
|
||||
add(filterCenterDepths)
|
||||
|
||||
val pca = new PCA(filterCenterDepths.filteredCentered)
|
||||
add(pca)
|
||||
|
||||
val normalize = new Normalize(pca)
|
||||
add(normalize)
|
||||
|
||||
val filterZscore = new FilterAndZscoreNormalized(normalize.normalized)
|
||||
add(filterZscore)
|
||||
|
||||
val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore)
|
||||
add(filterOriginal)
|
||||
|
||||
val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered)
|
||||
add(discover)
|
||||
|
||||
val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
|
||||
add(genotype)
|
||||
|
||||
if (genotypeSubsegments) {
|
||||
val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered)
|
||||
add(genotypeSegs)
|
||||
}
|
||||
}
|
||||
|
||||
class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction {
|
||||
@Input(doc="")
|
||||
var locus_value : File = locus_valueIn
|
||||
|
||||
@Output(doc="")
|
||||
var out : File = new File(outputBase.getPath + outSuffix)
|
||||
|
||||
def run = {
|
||||
var outWriter = new PrintWriter(new PrintStream(out))
|
||||
var elems = asScalaIterator(new XReadLines(locus_value))
|
||||
|
||||
while (elems.hasNext) {
|
||||
val line = elems.next
|
||||
val splitLine = line.split("\\s+")
|
||||
val locus = splitLine(0)
|
||||
val locValStr = splitLine(1)
|
||||
try {
|
||||
val locVal = locValStr.toDouble
|
||||
if (locVal < minVal || locVal > maxVal)
|
||||
outWriter.printf("%s%n", locus)
|
||||
}
|
||||
catch {
|
||||
case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus)
|
||||
case e: Exception => throw e
|
||||
}
|
||||
}
|
||||
|
||||
outWriter.close
|
||||
}
|
||||
}
|
||||
|
||||
class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit {
|
||||
@Input(doc = "")
|
||||
val input = inputParam
|
||||
|
||||
@Input(doc = "")
|
||||
val excludeTargets = excludeTargetsIn
|
||||
|
||||
@Output
|
||||
val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX)
|
||||
@Output
|
||||
val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX)
|
||||
@Output
|
||||
val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX)
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --matrix" +
|
||||
" -r " + input +
|
||||
" --centerData --centerType target" +
|
||||
" -o " + filteredCentered +
|
||||
" --outputExcludedTargets " + filteredTargets +
|
||||
" --outputExcludedSamples " + filteredSamples
|
||||
command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _)
|
||||
if (targetSampleFiltersString != "")
|
||||
command += " " + targetSampleFiltersString
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Filters samples and targets and then mean-centers the targets: " + command
|
||||
}
|
||||
|
||||
class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
|
||||
@Input(doc = "")
|
||||
val input = inputParam
|
||||
|
||||
val PCAbase: String = outputBase.getPath + ".RD_PCA"
|
||||
|
||||
@Output
|
||||
val outPC: File = new File(PCAbase + ".PC.txt")
|
||||
@Output
|
||||
val outPC_SD: File = new File(PCAbase + ".PC_SD.txt")
|
||||
@Output
|
||||
val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt")
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --PCA" +
|
||||
" -r " + input +
|
||||
" --PCAfiles " + PCAbase
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Runs PCA on mean-centered data: " + command
|
||||
}
|
||||
|
||||
class Normalize(pca: PCA) extends CommandLineFunction {
|
||||
@Input(doc = "")
|
||||
val input = pca.input
|
||||
|
||||
@Input(doc = "")
|
||||
val inPC = pca.outPC
|
||||
|
||||
@Input(doc = "")
|
||||
val inPC_SD = pca.outPC_SD
|
||||
|
||||
@Input(doc = "")
|
||||
val inPC_LOADINGS = pca.outPC_LOADINGS
|
||||
|
||||
@Output
|
||||
val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt")
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --normalize" +
|
||||
" -r " + input +
|
||||
" --PCAfiles " + pca.PCAbase +
|
||||
" --normalizeOutput " + normalized
|
||||
if (PCAnormalizeMethodString != "")
|
||||
command += " " + PCAnormalizeMethodString
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Normalizes mean-centered data using PCA information: " + command
|
||||
}
|
||||
|
||||
class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
|
||||
@Input(doc = "")
|
||||
val input = inputParam
|
||||
|
||||
@Output
|
||||
val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX)
|
||||
@Output
|
||||
val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX)
|
||||
@Output
|
||||
val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX)
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --matrix" +
|
||||
" -r " + input +
|
||||
" --centerData --centerType sample --zScoreData" +
|
||||
" -o " + filteredZscored +
|
||||
" --outputExcludedTargets " + filteredTargets +
|
||||
" --outputExcludedSamples " + filteredSamples
|
||||
if (targetSampleNormalizedFiltersString != "")
|
||||
command += " " + targetSampleNormalizedFiltersString
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command
|
||||
}
|
||||
|
||||
class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit {
|
||||
@Input(doc = "")
|
||||
val input = inputParam
|
||||
|
||||
@Input(doc = "")
|
||||
val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u))
|
||||
|
||||
@Input(doc = "")
|
||||
val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u))
|
||||
|
||||
@Output
|
||||
val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX)
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --matrix" +
|
||||
" -r " + input +
|
||||
targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) +
|
||||
sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) +
|
||||
" -o " + sameFiltered
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command
|
||||
}
|
||||
|
||||
class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
|
||||
@Input(doc = "")
|
||||
val input = inputParam
|
||||
|
||||
@Input(doc = "")
|
||||
val xhmmParams = xhmmParamsArg
|
||||
|
||||
@Input(doc = "")
|
||||
val origRD = origRDParam
|
||||
|
||||
@Output
|
||||
val xcnv: File = new File(outputBase.getPath + ".xcnv")
|
||||
|
||||
@Output
|
||||
val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv")
|
||||
|
||||
val posteriorsBase = outputBase.getPath
|
||||
|
||||
@Output
|
||||
val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt")
|
||||
|
||||
@Output
|
||||
val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt")
|
||||
|
||||
@Output
|
||||
val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt")
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --discover" +
|
||||
" -p " + xhmmParams +
|
||||
" -r " + input +
|
||||
" -R " + origRD +
|
||||
" -c " + xcnv +
|
||||
" -a " + aux_xcnv +
|
||||
" -s " + posteriorsBase +
|
||||
" " + discoverCommandLineParams
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Discovers CNVs in normalized data: " + command
|
||||
}
|
||||
|
||||
abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime {
|
||||
@Input(doc = "")
|
||||
val input = inputParam
|
||||
|
||||
@Input(doc = "")
|
||||
val xhmmParams = xhmmParamsArg
|
||||
|
||||
@Input(doc = "")
|
||||
val origRD = origRDParam
|
||||
|
||||
@Input(doc = "")
|
||||
val inXcnv = xcnv
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --genotype" +
|
||||
" -p " + xhmmParams +
|
||||
" -r " + input +
|
||||
" -g " + inXcnv +
|
||||
" -F " + referenceFile +
|
||||
" -R " + origRD +
|
||||
" " + genotypeCommandLineParams
|
||||
}
|
||||
|
||||
class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
|
||||
@Output
|
||||
val vcf: File = new File(outputBase.getPath + ".vcf")
|
||||
|
||||
command +=
|
||||
" -v " + vcf
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Genotypes discovered CNVs in all samples: " + command
|
||||
}
|
||||
|
||||
class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) {
|
||||
@Output
|
||||
val vcf: File = new File(outputBase.getPath + ".subsegments.vcf")
|
||||
|
||||
command +=
|
||||
" -v " + vcf +
|
||||
" --subsegments" +
|
||||
" --maxTargetsInSubsegment " + maxTargetsInSubsegment +
|
||||
" --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command
|
||||
}
|
||||
}
|
||||
|
|
@ -147,13 +147,13 @@ class GATKResourcesBundle extends QScript {
|
|||
//
|
||||
// example call set for wiki tutorial
|
||||
//
|
||||
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf",
|
||||
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf",
|
||||
"NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true))
|
||||
|
||||
//
|
||||
// Test BAM file, specific to each reference
|
||||
//
|
||||
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam",
|
||||
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam",
|
||||
"IGNORE", b37, false, false))
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -110,95 +110,103 @@ class QCommandLine extends CommandLineProgram with Logging {
|
|||
* functions, and then builds and runs a QGraph based on the dependencies.
|
||||
*/
|
||||
def execute = {
|
||||
ClassFieldCache.parsingEngine = this.parser
|
||||
var success = false
|
||||
var result = 1
|
||||
try {
|
||||
ClassFieldCache.parsingEngine = this.parser
|
||||
|
||||
if (settings.qSettings.runName == null)
|
||||
settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName)
|
||||
if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory))
|
||||
settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp")
|
||||
qGraph.initializeWithSettings(settings)
|
||||
if (settings.qSettings.runName == null)
|
||||
settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName)
|
||||
if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory))
|
||||
settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp")
|
||||
qGraph.initializeWithSettings(settings)
|
||||
|
||||
for (commandPlugin <- allCommandPlugins) {
|
||||
loadArgumentsIntoObject(commandPlugin)
|
||||
}
|
||||
|
||||
for (commandPlugin <- allCommandPlugins) {
|
||||
if (commandPlugin.statusMessenger != null)
|
||||
commandPlugin.statusMessenger.started()
|
||||
}
|
||||
|
||||
qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq
|
||||
|
||||
// TODO: Default command plugin argument?
|
||||
val remoteFileConverter = (
|
||||
for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null))
|
||||
yield commandPlugin.remoteFileConverter
|
||||
).headOption.getOrElse(null)
|
||||
|
||||
if (remoteFileConverter != null)
|
||||
loadArgumentsIntoObject(remoteFileConverter)
|
||||
|
||||
val allQScripts = qScriptPluginManager.createAllTypes()
|
||||
for (script <- allQScripts) {
|
||||
logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript])))
|
||||
loadArgumentsIntoObject(script)
|
||||
allCommandPlugins.foreach(_.initScript(script))
|
||||
// TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now.
|
||||
//if (settings.run)
|
||||
script.pullInputs()
|
||||
script.qSettings = settings.qSettings
|
||||
try {
|
||||
script.script()
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e)
|
||||
for (commandPlugin <- allCommandPlugins) {
|
||||
loadArgumentsIntoObject(commandPlugin)
|
||||
}
|
||||
|
||||
if (remoteFileConverter != null) {
|
||||
if (remoteFileConverter.convertToRemoteEnabled)
|
||||
script.mkRemoteOutputs(remoteFileConverter)
|
||||
}
|
||||
|
||||
script.functions.foreach(qGraph.add(_))
|
||||
logger.info("Added " + script.functions.size + " functions")
|
||||
}
|
||||
// Execute the job graph
|
||||
qGraph.run()
|
||||
|
||||
val functionsAndStatus = qGraph.getFunctionsAndStatus
|
||||
val success = qGraph.success
|
||||
|
||||
// walk over each script, calling onExecutionDone
|
||||
for (script <- allQScripts) {
|
||||
val scriptFunctions = functionsAndStatus.filterKeys(f => script.functions.contains(f))
|
||||
script.onExecutionDone(scriptFunctions, success)
|
||||
}
|
||||
|
||||
logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size))
|
||||
|
||||
// write the final complete job report
|
||||
logger.info("Writing final jobs report...")
|
||||
qGraph.writeJobsReport()
|
||||
|
||||
if (!success) {
|
||||
logger.info("Done with errors")
|
||||
qGraph.logFailed()
|
||||
for (commandPlugin <- allCommandPlugins)
|
||||
for (commandPlugin <- allCommandPlugins) {
|
||||
if (commandPlugin.statusMessenger != null)
|
||||
commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts))
|
||||
1
|
||||
} else {
|
||||
if (settings.run) {
|
||||
allQScripts.foreach(_.pushOutputs())
|
||||
for (commandPlugin <- allCommandPlugins)
|
||||
if (commandPlugin.statusMessenger != null) {
|
||||
val allInputs = allQScripts.map(_.remoteInputs)
|
||||
val allOutputs = allQScripts.map(_.remoteOutputs)
|
||||
commandPlugin.statusMessenger.done(allInputs, allOutputs)
|
||||
}
|
||||
commandPlugin.statusMessenger.started()
|
||||
}
|
||||
|
||||
qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq
|
||||
|
||||
// TODO: Default command plugin argument?
|
||||
val remoteFileConverter = (
|
||||
for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null))
|
||||
yield commandPlugin.remoteFileConverter
|
||||
).headOption.getOrElse(null)
|
||||
|
||||
if (remoteFileConverter != null)
|
||||
loadArgumentsIntoObject(remoteFileConverter)
|
||||
|
||||
val allQScripts = qScriptPluginManager.createAllTypes()
|
||||
for (script <- allQScripts) {
|
||||
logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript])))
|
||||
loadArgumentsIntoObject(script)
|
||||
allCommandPlugins.foreach(_.initScript(script))
|
||||
// TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now.
|
||||
//if (settings.run)
|
||||
script.pullInputs()
|
||||
script.qSettings = settings.qSettings
|
||||
try {
|
||||
script.script()
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e)
|
||||
}
|
||||
|
||||
if (remoteFileConverter != null) {
|
||||
if (remoteFileConverter.convertToRemoteEnabled)
|
||||
script.mkRemoteOutputs(remoteFileConverter)
|
||||
}
|
||||
|
||||
script.functions.foreach(qGraph.add(_))
|
||||
logger.info("Added " + script.functions.size + " functions")
|
||||
}
|
||||
// Execute the job graph
|
||||
qGraph.run()
|
||||
|
||||
val functionsAndStatus = qGraph.getFunctionsAndStatus
|
||||
|
||||
// walk over each script, calling onExecutionDone
|
||||
for (script <- allQScripts) {
|
||||
val scriptFunctions = functionsAndStatus.filterKeys(f => script.functions.contains(f))
|
||||
script.onExecutionDone(scriptFunctions, success)
|
||||
}
|
||||
|
||||
logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size))
|
||||
|
||||
// write the final complete job report
|
||||
logger.info("Writing final jobs report...")
|
||||
qGraph.writeJobsReport()
|
||||
|
||||
if (qGraph.success) {
|
||||
if (settings.run) {
|
||||
allQScripts.foreach(_.pushOutputs())
|
||||
for (commandPlugin <- allCommandPlugins)
|
||||
if (commandPlugin.statusMessenger != null) {
|
||||
val allInputs = allQScripts.map(_.remoteInputs)
|
||||
val allOutputs = allQScripts.map(_.remoteOutputs)
|
||||
commandPlugin.statusMessenger.done(allInputs, allOutputs)
|
||||
}
|
||||
}
|
||||
success = true
|
||||
result = 0
|
||||
}
|
||||
} finally {
|
||||
if (!success) {
|
||||
logger.info("Done with errors")
|
||||
qGraph.logFailed()
|
||||
if (settings.run) {
|
||||
for (commandPlugin <- allCommandPlugins)
|
||||
if (commandPlugin.statusMessenger != null)
|
||||
commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts))
|
||||
}
|
||||
}
|
||||
0
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -124,49 +124,26 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon
|
|||
}
|
||||
|
||||
/**
|
||||
* Pull all remote files to the local disk.
|
||||
* Pull all remote files to the local disk
|
||||
*/
|
||||
def pullInputs() {
|
||||
val inputs = ClassFieldCache.getFieldFiles(this, inputFields)
|
||||
for (remoteFile <- filterRemoteFiles(inputs)) {
|
||||
logger.info("Pulling %s from %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription))
|
||||
remoteFile.pullToLocal()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Push all remote files from the local disk.
|
||||
* Push all remote files from the local disk
|
||||
*/
|
||||
def pushOutputs() {
|
||||
val outputs = ClassFieldCache.getFieldFiles(this, outputFields)
|
||||
for (remoteFile <- filterRemoteFiles(outputs)) {
|
||||
logger.info("Pushing %s to %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription))
|
||||
remoteFile.pushToRemote()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* List out the remote outputs
|
||||
* @return the RemoteFile outputs by argument source
|
||||
* @return the inputs or null if there are no inputs
|
||||
*/
|
||||
def remoteInputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(inputFields))
|
||||
def remoteInputs: AnyRef = null
|
||||
|
||||
/**
|
||||
* List out the remote outputs
|
||||
* @return the RemoteFile outputs by argument source
|
||||
* @return the outputs or null if there are no outputs
|
||||
*/
|
||||
def remoteOutputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(outputFields))
|
||||
|
||||
private def tagMap(remoteFieldMap: Map[ArgumentSource, Seq[RemoteFile]]): Map[String, Seq[RemoteFile]] = {
|
||||
remoteFieldMap.collect{ case (k, v) => ClassFieldCache.fullName(k) -> v }.toMap
|
||||
}
|
||||
|
||||
private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = {
|
||||
fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap
|
||||
}
|
||||
|
||||
private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] =
|
||||
fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile])
|
||||
def remoteOutputs: AnyRef = null
|
||||
|
||||
/** The complete list of fields. */
|
||||
def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.getClass)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import org.broadinstitute.sting.queue.util.RemoteFile
|
|||
*/
|
||||
trait QStatusMessenger {
|
||||
def started()
|
||||
def done(inputs: Seq[Map[String, Seq[RemoteFile]]], outputs: Seq[Map[String, Seq[RemoteFile]]])
|
||||
def done(inputs: Seq[_], outputs: Seq[_])
|
||||
def exit(message: String)
|
||||
|
||||
def started(job: String)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,131 @@
|
|||
package org.broadinstitute.sting.queue.util
|
||||
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK}
|
||||
import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType
|
||||
import org.broadinstitute.sting.commandline.{Input, Gather, Output}
|
||||
import org.broadinstitute.sting.queue.function.CommandLineFunction
|
||||
import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils
|
||||
|
||||
package object DoC {
|
||||
class DoC(val bams: List[File], val DoC_output: File, val countType: CoverageUtils.CountPileupType, val MAX_DEPTH: Int, val minMappingQuality: Int, val minBaseQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction {
|
||||
val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary"
|
||||
|
||||
// So that the output files of this DoC run get deleted once they're used further downstream:
|
||||
this.isIntermediate = true
|
||||
|
||||
this.analysis_type = "DepthOfCoverage"
|
||||
|
||||
this.input_file = bams
|
||||
|
||||
this.downsample_to_coverage = Some(MAX_DEPTH)
|
||||
this.downsampling_type = DownsampleType.BY_SAMPLE
|
||||
|
||||
this.scatterCount = scatterCountInput
|
||||
this.scatterClass = classOf[IntervalScatterFunction]
|
||||
|
||||
// HACK for DoC to work properly within Queue:
|
||||
@Output
|
||||
@Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
|
||||
var intervalSampleOut: File = new File(DoC_output.getPath + DOC_OUTPUT_SUFFIX)
|
||||
|
||||
override def commandLine = super.commandLine +
|
||||
" --omitDepthOutputAtEachBase" +
|
||||
" --omitLocusTable" +
|
||||
" --minMappingQuality " + minMappingQuality +
|
||||
" --minBaseQuality " + minBaseQuality +
|
||||
optional("--countType", countType, spaceSeparated=true, escape=true, format="%s") +
|
||||
" --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS +
|
||||
(if (!minCoverageCalcs.isEmpty) minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).reduceLeft(_ + "" + _) else "") +
|
||||
" --includeRefNSites" +
|
||||
" -o " + DoC_output
|
||||
|
||||
override def shortDescription = "DoC: " + DoC_output
|
||||
}
|
||||
|
||||
class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality: Int, minBaseQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) {
|
||||
// HACK for DoC to work properly within Queue:
|
||||
@Output
|
||||
@Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction])
|
||||
var outPrefix = DoC_output
|
||||
|
||||
override def commandLine = super.commandLine.replaceAll(" --omitDepthOutputAtEachBase", "")
|
||||
}
|
||||
|
||||
def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = {
|
||||
var l: List[Group] = Nil
|
||||
|
||||
var remaining = samples
|
||||
var subsamples: List[String] = Nil
|
||||
var count = 1
|
||||
|
||||
while (!remaining.isEmpty) {
|
||||
val splitRes = (remaining splitAt samplesPerJob)
|
||||
subsamples = splitRes._1
|
||||
remaining = splitRes._2
|
||||
l ::= new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams))
|
||||
count = count + 1
|
||||
}
|
||||
|
||||
return l
|
||||
}
|
||||
|
||||
// A group has a list of samples and bam files to use for DoC
|
||||
class Group(val name: String, val outputBase: File, val samples: List[String], val bams: List[File]) {
|
||||
// getName() just includes the file name WITHOUT the path:
|
||||
val groupOutputName = name + "." + outputBase.getName
|
||||
|
||||
// Comment this out, so that when jobs are scattered in DoC class below, they do not scatter into outputs at directories that don't exist!!! :
|
||||
//def DoC_output = new File(outputBase.getParentFile(), groupOutputName)
|
||||
|
||||
def DoC_output = new File(groupOutputName)
|
||||
|
||||
override def toString(): String = String.format("[Group %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams)
|
||||
}
|
||||
|
||||
class MergeGATKdepths(DoCsToCombine: List[File], outFile: String, columnSuffix: String, xhmmExec: File, sampleIDsMap: String, sampleIDsMapFromColumn: Int, sampleIDsMapToColumn: Int, rdPrecisionArg: Option[Int], outputTargetsBySamples: Boolean) extends CommandLineFunction {
|
||||
@Input(doc = "")
|
||||
var inputDoCfiles: List[File] = DoCsToCombine
|
||||
|
||||
@Output
|
||||
val mergedDoC: File = new File(outFile)
|
||||
var command: String =
|
||||
xhmmExec + " --mergeGATKdepths" +
|
||||
inputDoCfiles.map(input => " --GATKdepths " + input).reduceLeft(_ + "" + _) +
|
||||
" --columnSuffix " + columnSuffix +
|
||||
" -o " + mergedDoC
|
||||
if (sampleIDsMap != "")
|
||||
command += " --sampleIDmap " + sampleIDsMap + " --fromID " + sampleIDsMapFromColumn + " --toID " + sampleIDsMapToColumn
|
||||
rdPrecisionArg match {
|
||||
case Some(rdPrecision) => {
|
||||
command += " --rdPrecision " + rdPrecision
|
||||
}
|
||||
case None => {}
|
||||
}
|
||||
if (outputTargetsBySamples)
|
||||
command += " --outputTargetsBySamples"
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Combines DoC outputs for multiple samples (at same loci): " + command
|
||||
}
|
||||
|
||||
class PrepareTargets(intervalsIn: List[File], outIntervals: String, val xhmmExec: File, val referenceFile: File) extends CommandLineFunction {
|
||||
@Input(doc = "List of files containing targeted intervals to be prepared and merged")
|
||||
var inIntervals: List[File] = intervalsIn
|
||||
|
||||
@Output(doc = "The merged intervals file to write to")
|
||||
var out: File = new File(outIntervals)
|
||||
|
||||
var command: String =
|
||||
xhmmExec + " --prepareTargets" +
|
||||
" -F " + referenceFile +
|
||||
inIntervals.map(intervFile => " --targets " + intervFile).reduceLeft(_ + "" + _) +
|
||||
" --mergedTargets " + out
|
||||
|
||||
def commandLine = command
|
||||
|
||||
override def description = "Sort all target intervals, merge overlapping ones, and print the resulting interval list: " + command
|
||||
}
|
||||
}
|
||||
|
|
@ -26,36 +26,31 @@ object VCF_BAM_utilities {
|
|||
case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list")
|
||||
}
|
||||
|
||||
def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match {
|
||||
case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
|
||||
|
||||
case x :: y =>
|
||||
val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBAMsForSample(y)
|
||||
val bamSamples: List[String] = getSamplesInBAM(x)
|
||||
def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = {
|
||||
var m = scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
|
||||
|
||||
for (bam <- bams) {
|
||||
val bamSamples: List[String] = getSamplesInBAM(bam)
|
||||
for (s <- bamSamples) {
|
||||
if (!m.contains(s))
|
||||
m += s -> scala.collection.mutable.Set.empty[File]
|
||||
|
||||
m(s) = m(s) + x
|
||||
m(s) += bam
|
||||
}
|
||||
}
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
def findBAMsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = {
|
||||
var s = scala.collection.mutable.Set.empty[File]
|
||||
|
||||
def findBAMsForSamplesHelper(samples: List[String]): scala.collection.mutable.Set[File] = samples match {
|
||||
case Nil => scala.collection.mutable.Set.empty[File]
|
||||
|
||||
case x :: y =>
|
||||
var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File]
|
||||
if (sampleToBams.contains(x))
|
||||
bamsForSampleX = sampleToBams(x)
|
||||
return bamsForSampleX ++ findBAMsForSamplesHelper(y)
|
||||
for (sample <- samples) {
|
||||
if (sampleToBams.contains(sample))
|
||||
s ++= sampleToBams(sample)
|
||||
}
|
||||
|
||||
val l: List[File] = Nil
|
||||
return l ++ findBAMsForSamplesHelper(samples)
|
||||
return l ++ s
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ import org.testng.annotations.Test
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class DataProcessingPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testSimpleBAM {
|
||||
val projectName = "test1"
|
||||
val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam"
|
||||
|
|
@ -45,7 +45,7 @@ class DataProcessingPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testBWAPEBAM {
|
||||
val projectName = "test2"
|
||||
val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam"
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ import org.testng.annotations.Test
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class PacbioProcessingPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testPacbioProcessingPipeline {
|
||||
val testOut = "exampleBAM.recal.bam"
|
||||
val spec = new PipelineTestSpec
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class DevNullOutputPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testDevNullOutput() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "devnulloutput"
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class ExampleCountLociPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testCountLoci() {
|
||||
val testOut = "count.out"
|
||||
val spec = new PipelineTestSpec
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class ExampleCountReadsPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testCountReads() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "countreads"
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class ExampleReadFilterPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testExampleReadFilter() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "examplereadfilter"
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class ExampleRetryMemoryLimitPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testRetryMemoryLimit() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "RetryMemoryLimit"
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
|||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
class ExampleUnifiedGenotyperPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testUnifiedGenotyper() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "unifiedgenotyper"
|
||||
|
|
@ -51,7 +51,7 @@ class ExampleUnifiedGenotyperPipelineTest {
|
|||
Array("vcf_intervals", BaseTest.validationDataLocation + "intervalTest.1.vcf")
|
||||
).asInstanceOf[Array[Array[Object]]]
|
||||
|
||||
@Test(dataProvider = "ugIntervals")
|
||||
@Test(dataProvider = "ugIntervals", timeOut=36000000)
|
||||
def testUnifiedGenotyperWithIntervals(intervalsName: String, intervalsPath: String) {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "unifiedgenotyper_with_" + intervalsName
|
||||
|
|
@ -64,7 +64,7 @@ class ExampleUnifiedGenotyperPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testUnifiedGenotyperNoGCOpt() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "unifiedgenotyper_no_gc_opt"
|
||||
|
|
@ -80,7 +80,7 @@ class ExampleUnifiedGenotyperPipelineTest {
|
|||
@DataProvider(name="resMemReqParams")
|
||||
def getResMemReqParam = Array(Array("mem_free"), Array("virtual_free")).asInstanceOf[Array[Array[Object]]]
|
||||
|
||||
@Test(dataProvider = "resMemReqParams")
|
||||
@Test(dataProvider = "resMemReqParams", timeOut=36000000)
|
||||
def testUnifiedGenotyperResMemReqParam(reqParam: String) {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "unifiedgenotyper_" + reqParam
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ import org.testng.annotations.Test
|
|||
import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
|
||||
|
||||
class HelloWorldPipelineTest {
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorld() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorld"
|
||||
|
|
@ -37,7 +37,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithRunName() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithRunName"
|
||||
|
|
@ -47,7 +47,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithMemoryLimit() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldMemoryLimit"
|
||||
|
|
@ -57,7 +57,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithPriority() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithPriority"
|
||||
|
|
@ -67,7 +67,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithLsfResource() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithLsfResource"
|
||||
|
|
@ -77,7 +77,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithLsfResourceAndMemoryLimit() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithLsfResourceAndMemoryLimit"
|
||||
|
|
@ -87,7 +87,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithLsfEnvironment() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithLsfEnvironment"
|
||||
|
|
@ -97,7 +97,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithGridEngineResource() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithGridEngineResource"
|
||||
|
|
@ -107,7 +107,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithGridEngineResourceAndMemoryLimit() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit"
|
||||
|
|
@ -117,7 +117,7 @@ class HelloWorldPipelineTest {
|
|||
PipelineTest.executeTest(spec)
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test(timeOut=36000000)
|
||||
def testHelloWorldWithGridEngineEnvironment() {
|
||||
val spec = new PipelineTestSpec
|
||||
spec.name = "HelloWorldWithGridEngineEnvironment"
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -1,3 +1,3 @@
|
|||
<ivy-module version="1.0">
|
||||
<info organisation="org.broad" module="tribble" revision="110" status="integration" />
|
||||
<info organisation="org.broad" module="tribble" revision="119" status="integration" />
|
||||
</ivy-module>
|
||||
Loading…
Reference in New Issue