Merge branch 'master' of github.com:broadinstitute/gsa-unstable

This commit is contained in:
Menachem Fromer 2012-11-20 12:19:58 -05:00
commit 9111966261
88 changed files with 2142 additions and 1364 deletions

View File

@ -327,14 +327,18 @@
<!-- INIT OVERRIDES: call these targets BEFORE init to override build defaults -->
<target name="init.publiconly">
<target name="init.build.publiconly">
<property name="build.target" value="public" />
</target>
<target name="init.publicprotectedonly">
<target name="init.build.publicprotectedonly">
<property name="build.target" value="protected" />
</target>
<target name="init.build.all">
<property name="build.target" value="all" />
</target>
<target name="init.javaonly">
<property name="compile.scala" value="false" />
</target>
@ -842,19 +846,23 @@
<!-- Release-related tasks -->
<!-- ******************************************************************************** -->
<target name="init.buildgatkfull" depends="init.publicprotectedonly, init.javaonly">
<target name="init.executable.gatkfull" depends="init.build.publicprotectedonly, init.javaonly">
<property name="executable" value="GenomeAnalysisTK" />
</target>
<target name="init.buildgatklite" depends="init.publiconly, init.javaonly">
<target name="init.executable.gatklite" depends="init.build.publiconly, init.javaonly">
<property name="executable" value="GenomeAnalysisTKLite" />
</target>
<target name="init.buildqueuefull" depends="init.publicprotectedonly, init.javaandscala">
<target name="init.executable.queueall" depends="init.build.all, init.javaandscala">
<property name="executable" value="Queue" />
</target>
<target name="init.buildqueuelite" depends="init.publiconly, init.javaandscala">
<target name="init.executable.queuefull" depends="init.build.publicprotectedonly, init.javaandscala">
<property name="executable" value="Queue" />
</target>
<target name="init.executable.queuelite" depends="init.build.publiconly, init.javaandscala">
<property name="executable" value="QueueLite" />
</target>
@ -891,7 +899,7 @@
<!-- Build a package consisting of all supporting files. Don't call this target directly. Call one of the specific packaging targets below -->
<target name="package" depends="dist,stage,require.executable" description="bundle up an executable for distribution">
<target name="package" depends="require.clean,dist,stage,require.executable" description="bundle up an executable for distribution">
<mkdir dir="${package.output.dir}" />
<xslt destdir="${package.output.dir}" style="${package.xml.dir}/CreatePackager.xsl" useImplicitFileset="false">
<flattenmapper/>
@ -906,13 +914,15 @@
</target>
<!-- Package specific versions of the GATK/Queue. ALWAYS do an ant clean before invoking these! -->
<target name="package.gatk.full" depends="init.buildgatkfull,package" />
<target name="package.gatk.full" depends="init.executable.gatkfull,package" />
<target name="package.gatk.lite" depends="init.buildgatklite,package" />
<target name="package.gatk.lite" depends="init.executable.gatklite,package" />
<target name="package.queue.full" depends="init.buildqueuefull,package" />
<target name="package.queue.all" depends="init.executable.queueall,package" />
<target name="package.queue.lite" depends="init.buildqueuelite,package" />
<target name="package.queue.full" depends="init.executable.queuefull,package" />
<target name="package.queue.lite" depends="init.executable.queuelite,package" />
<!-- Release a build. Don't call this target directly. Call one of the specific release targets below -->
@ -975,6 +985,8 @@
<target name="mvninstall.gatk.lite" depends="package.gatk.lite,mvninstall" />
<target name="mvninstall.queue.all" depends="package.queue.all,mvninstall" />
<target name="mvninstall.queue.full" depends="package.queue.full,mvninstall" />
<target name="mvninstall.queue.lite" depends="package.queue.lite,mvninstall" />
@ -1011,6 +1023,24 @@
<delete dir="${pipelinetest.dir}"/>
</target>
<!-- Depend on this target if your target requires a clean working directory but you don't want to depend on clean directly -->
<target name="require.clean">
<condition property="not.clean">
<or>
<available file="${build.dir}" />
<available file="${lib.dir}" />
<available file="${contract.dump.dir}" />
<available file="${staging.dir}" />
<available file="${dist.dir}" />
<available file="${pipelinetest.dir}" />
<available file="${javadoc.dir}" />
<available file="${scaladoc.dir}" />
<available file="${gatkdocs.dir}" />
</or>
</condition>
<fail message="Selected build target requires a clean working directory. Run ant clean and then try again." if="not.clean" />
</target>
<!-- ******************************************************************************** -->
<!-- gsalib -->
@ -1174,14 +1204,16 @@
<echo message="" />
<echo message="Sting: Running @{testtype} test cases!"/>
<!-- no test is allowed to run for more than 10 hours -->
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
<testng outputDir="@{outputdir}"
classpathref="${testng.classpath}"
haltOnFailure="false" failureProperty="test.failure"
verbose="2"
timeout="36000000"
workingDir="${basedir}"
useDefaultListeners="false"
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
<jvmarg value="-Xmx${test.maxmemory}" />
<jvmarg value="-ea" />
<jvmarg value="-Djava.awt.headless=true" />

View File

@ -67,7 +67,6 @@ public class AlleleBiasedDownsamplingUtils {
alleleStratifiedElements[baseIndex].add(pe);
}
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
@ -78,12 +77,21 @@ public class AlleleBiasedDownsamplingUtils {
}
});
// make a listing of allele counts
final int[] alleleCounts = new int[4];
for ( int i = 0; i < 4; i++ )
alleleCounts[i] = alleleStratifiedElements[i].size();
// do smart down-sampling
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
for ( int i = 0; i < 4; i++ ) {
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
if ( alleleList.size() <= numReadsToRemove )
logAllElements(alleleList, log);
// if we don't need to remove any reads, keep them all
if ( alleleList.size() <= targetAlleleCounts[i] )
elementsToKeep.addAll(alleleList);
else
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log));
elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
}
// clean up pointers so memory can be garbage collected if needed
@ -93,6 +101,66 @@ public class AlleleBiasedDownsamplingUtils {
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
}
private static int scoreAlleleCounts(final int[] alleleCounts) {
if ( alleleCounts.length < 2 )
return 0;
// sort the counts (in ascending order)
final int[] alleleCountsCopy = alleleCounts.clone();
Arrays.sort(alleleCountsCopy);
final int maxCount = alleleCountsCopy[alleleCounts.length - 1];
final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2];
int remainderCount = 0;
for ( int i = 0; i < alleleCounts.length - 2; i++ )
remainderCount += alleleCountsCopy[i];
// try to get the best score:
// - in the het case the counts should be equal with nothing else
// - in the hom case the non-max should be zero
return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount));
}
/**
* Computes an allele biased version of the given pileup
*
* @param alleleCounts the original pileup
* @param numReadsToRemove fraction of total reads to remove per allele
* @return allele biased pileup
*/
protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
final int numAlleles = alleleCounts.length;
int maxScore = scoreAlleleCounts(alleleCounts);
int[] alleleCountsOfMax = alleleCounts;
final int numReadsToRemovePerAllele = numReadsToRemove / 2;
for ( int i = 0; i < numAlleles; i++ ) {
for ( int j = i; j < numAlleles; j++ ) {
final int[] newCounts = alleleCounts.clone();
// split these cases so we don't lose on the floor (since we divided by 2)
if ( i == j ) {
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove);
} else {
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele);
newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele);
}
final int score = scoreAlleleCounts(newCounts);
if ( score < maxScore ) {
maxScore = score;
alleleCountsOfMax = newCounts;
}
}
}
return alleleCountsOfMax;
}
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
*
@ -102,7 +170,15 @@ public class AlleleBiasedDownsamplingUtils {
* @return the list of pileup elements TO KEEP
*/
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
if ( numElementsToRemove == 0 )
return elements;
final int pileupSize = elements.size();
if ( numElementsToRemove == pileupSize ) {
logAllElements(elements, log);
return new ArrayList<PileupElement>(0);
}
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
@ -132,15 +208,25 @@ public class AlleleBiasedDownsamplingUtils {
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
totalReads += reads.size();
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
int numReadsToRemove = (int)(totalReads * downsamplingFraction);
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove * alleleReadMap.size());
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() ) {
if ( reads.size() <= numReadsToRemove ) {
readsToRemove.addAll(reads);
logAllReads(reads, log);
} else {
readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log));
// make a listing of allele counts
final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
final int numAlleles = alleles.size();
final int[] alleleCounts = new int[numAlleles];
for ( int i = 0; i < numAlleles; i++ )
alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
// do smart down-sampling
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
for ( int i = 0; i < numAlleles; i++ ) {
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
if ( alleleBin.size() > targetAlleleCounts[i] ) {
readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
}
}
@ -156,13 +242,22 @@ public class AlleleBiasedDownsamplingUtils {
* @return the list of pileup elements TO REMOVE
*/
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
if ( numElementsToRemove == 0 )
return readsToRemove;
final int pileupSize = reads.size();
if ( numElementsToRemove == pileupSize ) {
logAllReads(reads, log);
return reads;
}
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(pileupSize - numElementsToRemove);
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) ) {
final GATKSAMRecord read = reads.get(i);

View File

@ -39,57 +39,12 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
// optimization: only allocate temp arrays once per thread
private final ThreadLocal<byte[]> threadLocalTempQualArray = new ThreadLocalArray<byte[]>(EventType.values().length, byte.class);
private final ThreadLocal<boolean[]> threadLocalTempErrorArray = new ThreadLocalArray<boolean[]>(EventType.values().length, boolean.class);
private final ThreadLocal<double[]> threadLocalTempFractionalErrorArray = new ThreadLocalArray<double[]>(EventType.values().length, double.class);
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
super.initialize(covariates, recalibrationTables);
}
/**
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
* adding one to the number of observations and potentially one to the number of mismatches for all three
* categories (mismatches, insertions and deletions).
*
* @param pileupElement The pileup element to update
* @param refBase The reference base at this locus
*/
@Override
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
final int offset = pileupElement.getOffset();
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
byte[] tempQualArray = threadLocalTempQualArray.get();
boolean[] tempErrorArray = threadLocalTempErrorArray.get();
tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual();
tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase);
tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual();
tempErrorArray[EventType.BASE_INSERTION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterInsertion() : pileupElement.isBeforeInsertion();
tempQualArray[EventType.BASE_DELETION.index] = pileupElement.getBaseDeletionQual();
tempErrorArray[EventType.BASE_DELETION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterDeletedBase() : pileupElement.isBeforeDeletedBase();
for (final EventType eventType : EventType.values()) {
final int[] keys = readCovariates.getKeySet(offset, eventType);
final int eventIndex = eventType.index;
final byte qual = tempQualArray[eventIndex];
final boolean isError = tempErrorArray[eventIndex];
// TODO: should this really be combine rather than increment?
combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex);
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
for (int i = 2; i < covariates.length; i++) {
if (keys[i] < 0)
continue;
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
}
}
}
@Override
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {

View File

@ -0,0 +1,59 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.GenomeLocComparator;
import java.util.Collection;
import java.util.TreeSet;
/**
* A stash of regions that must be kept uncompressed in all samples
*
* In general, these are regions that were kept uncompressed by a tumor sample and we want to force
* all other samples (normals and/or tumors) to also keep these regions uncompressed
*
* User: carneiro
* Date: 10/15/12
* Time: 4:08 PM
*/
public class CompressionStash extends TreeSet<SimpleGenomeLoc> {
public CompressionStash() {
super(new GenomeLocComparator());
}
/**
* Adds a SimpleGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc
* in the stash.
*
* @param insertLoc the new loc to be inserted
* @return true if the loc, or it's merged version, wasn't present in the list before.
*/
@Override
public boolean add(SimpleGenomeLoc insertLoc) {
TreeSet<SimpleGenomeLoc> removedLocs = new TreeSet<SimpleGenomeLoc>();
for (SimpleGenomeLoc existingLoc : this) {
if (existingLoc.isPast(insertLoc)) {
break; // if we're past the loc we're done looking for overlaps.
}
if (existingLoc.equals(insertLoc)) {
return false; // if this loc was already present in the stash, we don't need to insert it.
}
if (existingLoc.contiguousP(insertLoc)) {
removedLocs.add(existingLoc); // list the original loc for merging
}
}
for (SimpleGenomeLoc loc : removedLocs) {
this.remove(loc); // remove all locs that will be merged
}
removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged
return super.add(SimpleGenomeLoc.merge(removedLocs)); // merge them all into one loc and add to the stash
}
@Override
public boolean addAll(Collection<? extends SimpleGenomeLoc> locs) {
boolean result = false;
for (SimpleGenomeLoc loc : locs) {
result |= this.add(loc);
}
return result;
}
}

View File

@ -3,13 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.Set;
import java.util.TreeSet;
/*
@ -41,7 +42,7 @@ import java.util.TreeSet;
*
* @author depristo
*/
public class MultiSampleCompressor implements Compressor {
public class MultiSampleCompressor {
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();
@ -63,21 +64,36 @@ public class MultiSampleCompressor implements Compressor {
}
}
@Override
public Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
String sample = read.getReadGroup().getSample();
SingleSampleCompressor compressor = compressorsPerSample.get(sample);
public Set<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
String sampleName = read.getReadGroup().getSample();
SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
if ( compressor == null )
throw new ReviewedStingException("No compressor for sample " + sample);
return compressor.addAlignment(read);
throw new ReviewedStingException("No compressor for sample " + sampleName);
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read);
Set<GATKSAMRecord> reads = readsAndStash.getFirst();
CompressionStash regions = readsAndStash.getSecond();
reads.addAll(closeVariantRegionsInAllSamples(regions));
return reads;
}
@Override
public Iterable<GATKSAMRecord> close() {
SortedSet<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
for ( SingleSampleCompressor comp : compressorsPerSample.values() )
for ( GATKSAMRecord read : comp.close() )
reads.add(read);
public Set<GATKSAMRecord> close() {
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close();
reads = readsAndStash.getFirst();
}
return reads;
}
private Set<GATKSAMRecord> closeVariantRegionsInAllSamples(CompressionStash regions) {
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
if (!regions.isEmpty()) {
for (SingleSampleCompressor sample : compressorsPerSample.values()) {
reads.addAll(sample.closeVariantRegions(regions));
}
}
return reads;
}
}

View File

@ -25,6 +25,9 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.util.SequenceUtil;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
@ -45,6 +48,7 @@ import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -86,7 +90,8 @@ import java.util.*;
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
@Output
private StingSAMFileWriter out;
private StingSAMFileWriter out = null;
private SAMFileWriter writerToUse = null;
/**
* The number of bases to keep around mismatches (potential variation)
@ -196,6 +201,10 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
@Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false)
private int nContigs = 2;
@Hidden
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
private boolean nwayout = false;
@Hidden
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
private int debugLevel = 0;
@ -222,9 +231,12 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
CompressionStash compressionStash = new CompressionStash();
SortedSet<GenomeLoc> intervalList;
private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam";
/**
* Basic generic initialization of the readNameHash and the intervalList. Output initialization
@ -240,10 +252,23 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
if (toolkit.getIntervals() != null)
intervalList.addAll(toolkit.getIntervals());
if (!NO_PG_TAG)
Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME);
else
// todo -- rework the whole NO_PG_TAG thing
final boolean preSorted = true;
final boolean indexOnTheFly = true;
final boolean keep_records = true;
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
if (nwayout) {
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true);
}
else {
writerToUse = out;
out.setPresorted(false);
if (!NO_PG_TAG) {
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
}
}
}
/**
@ -384,6 +409,9 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
// output any remaining reads in the compressor
for (GATKSAMRecord read : stash.close())
outputRead(read);
if (nwayout)
writerToUse.close();
}
/**
@ -552,7 +580,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
if (!DONT_COMPRESS_READ_NAMES)
compressReadName(read);
out.addAlignment(read);
writerToUse.addAlignment(read);
}
/**

View File

@ -1,8 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Set;
import java.util.TreeSet;
/**
@ -10,7 +12,7 @@ import java.util.TreeSet;
* @author carneiro, depristo
* @version 3.0
*/
public class SingleSampleCompressor implements Compressor {
public class SingleSampleCompressor {
final private int contextSize;
final private int downsampleCoverage;
final private int minMappingQuality;
@ -24,6 +26,7 @@ public class SingleSampleCompressor implements Compressor {
private SlidingWindow slidingWindow;
private int slidingWindowCounter;
public static Pair<Set<GATKSAMRecord>, CompressionStash> emptyPair = new Pair<Set<GATKSAMRecord>,CompressionStash>(new TreeSet<GATKSAMRecord>(), new CompressionStash());
public SingleSampleCompressor(final int contextSize,
final int downsampleCoverage,
@ -46,12 +49,9 @@ public class SingleSampleCompressor implements Compressor {
this.allowPolyploidReduction = allowPolyploidReduction;
}
/**
* @{inheritDoc}
*/
@Override
public Iterable<GATKSAMRecord> addAlignment( GATKSAMRecord read ) {
TreeSet<GATKSAMRecord> result = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
public Pair<Set<GATKSAMRecord>, CompressionStash> addAlignment( GATKSAMRecord read ) {
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
CompressionStash stash = new CompressionStash();
int readOriginalStart = read.getUnclippedStart();
// create a new window if:
@ -60,7 +60,9 @@ public class SingleSampleCompressor implements Compressor {
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
// close the current sliding window
result.addAll(slidingWindow.close());
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close();
reads = readsAndStash.getFirst();
stash = readsAndStash.getSecond();
slidingWindow = null; // so we create a new one on the next if
}
@ -69,13 +71,16 @@ public class SingleSampleCompressor implements Compressor {
slidingWindowCounter++;
}
result.addAll(slidingWindow.addRead(read));
return result;
stash.addAll(slidingWindow.addRead(read));
return new Pair<Set<GATKSAMRecord>, CompressionStash>(reads, stash);
}
@Override
public Iterable<GATKSAMRecord> close() {
return (slidingWindow != null) ? slidingWindow.close() : new TreeSet<GATKSAMRecord>();
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
}
public Set<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
return slidingWindow.closeVariantRegions(regions);
}
}

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -57,6 +58,8 @@ public class SlidingWindow {
private boolean allowPolyploidReductionInGeneral;
private static CompressionStash emptyRegions = new CompressionStash();
/**
* The types of synthetic reads to use in the finalizeAndAdd method
*/
@ -137,7 +140,7 @@ public class SlidingWindow {
* @param read the read
* @return a list of reads that have been finished by sliding the window.
*/
public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
public CompressionStash addRead(GATKSAMRecord read) {
addToHeader(windowHeader, read); // update the window header counts
readsInWindow.add(read); // add read to sliding reads
return slideWindow(read.getUnclippedStart());
@ -151,8 +154,9 @@ public class SlidingWindow {
* @param variantSite boolean array with true marking variant regions
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
*/
private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) {
boolean foundStart = false;
final int windowHeaderStart = getStartLocation(windowHeader);
int variantRegionStartIndex = 0;
for (int i=from; i<to; i++) {
if (variantSite[i] && !foundStart) {
@ -160,10 +164,12 @@ public class SlidingWindow {
foundStart = true;
}
else if(!variantSite[i] && foundStart) {
return(new Pair<Integer, Integer>(variantRegionStartIndex, i-1));
return(new SimpleGenomeLoc(contig, contigIndex, windowHeaderStart + variantRegionStartIndex, windowHeaderStart + i - 1, true));
}
}
return (foundStart) ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
final int refStart = windowHeaderStart + variantRegionStartIndex;
final int refStop = windowHeaderStart + to - 1;
return (foundStart && forceClose) ? new SimpleGenomeLoc(contig, contigIndex, refStart, refStop, true) : null;
}
/**
@ -172,25 +178,25 @@ public class SlidingWindow {
* @param from beginning window header index of the search window (inclusive)
* @param to end window header index of the search window (exclusive)
* @param variantSite boolean array with true marking variant regions
* @return a list with start/stops of variant regions following getNextVariantRegion description
* @return a list with start/stops of variant regions following findNextVariantRegion description
*/
private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
List<Pair<Integer,Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
private CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean forceClose) {
CompressionStash regions = new CompressionStash();
int index = from;
while(index < to) {
Pair<Integer,Integer> result = getNextVariantRegion(index, to, variantSite);
SimpleGenomeLoc result = findNextVariantRegion(index, to, variantSite, forceClose);
if (result == null)
break;
regions.add(result);
if (result.getSecond() < 0)
if (!result.isFinished())
break;
index = result.getSecond() + 1;
index = result.getStop() + 1;
}
return regions;
}
/**
* Determines if the window can be slid given the new incoming read.
*
@ -201,25 +207,25 @@ public class SlidingWindow {
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
* @return all reads that have fallen to the left of the sliding window after the slide
*/
protected List<GATKSAMRecord> slideWindow(final int incomingReadUnclippedStart) {
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
protected CompressionStash slideWindow(final int incomingReadUnclippedStart) {
final int windowHeaderStartLocation = getStartLocation(windowHeader);
CompressionStash regions = emptyRegions;
boolean forceClose = true;
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
markSites(incomingReadUnclippedStart);
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet());
finalizedReads = closeVariantRegions(regions, false);
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
readsInWindow.pollFirst();
}
regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose);
}
return finalizedReads;
// todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
readsInWindow.pollFirst();
}
return regions;
}
@ -623,26 +629,27 @@ public class SlidingWindow {
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
return result; // finalized reads will be downsampled if necessary
return result; // finalized reads will be downsampled if necessary
}
private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
public Set<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
TreeSet<GATKSAMRecord> allReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
if (!regions.isEmpty()) {
int lastStop = -1;
for (Pair<Integer, Integer> region : regions) {
int start = region.getFirst();
int stop = region.getSecond();
if (stop < 0 && forceClose)
stop = windowHeader.size() - 1;
if (stop >= 0) {
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
int windowHeaderStart = getStartLocation(windowHeader);
for (SimpleGenomeLoc region : regions) {
if (region.isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() <= windowHeaderStart + windowHeader.size()) {
int start = region.getStart() - windowHeaderStart;
int stop = region.getStop() - windowHeaderStart;
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track
lastStop = stop;
}
}
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
for (int i = 0; i <= lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
windowHeader.remove();
}
return allReads;
}
@ -676,23 +683,24 @@ public class SlidingWindow {
*
* @return All reads generated
*/
public List<GATKSAMRecord> close() {
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
// mark variant regions
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
Set<GATKSAMRecord> finalizedReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
CompressionStash regions = new CompressionStash();
boolean forceCloseUnfinishedRegions = true;
if (!windowHeader.isEmpty()) {
markSites(getStopLocation(windowHeader) + 1);
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
finalizedReads = closeVariantRegions(regions, true);
regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
finalizedReads = closeVariantRegions(regions);
if (!windowHeader.isEmpty()) {
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
}
}
return finalizedReads;
return new Pair<Set<GATKSAMRecord>, CompressionStash>(finalizedReads, regions);
}
/**

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -41,7 +40,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
@ -212,7 +214,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
private VariantAnnotatorEngine annotationEngine;
// fasta reference reader to supplement the edges of the reference sequence
private IndexedFastaSequenceFile referenceReader;
private CachingIndexedFastaSequenceFile referenceReader;
// reference base padding size
private static final int REFERENCE_PADDING = 900;
@ -324,15 +326,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
}
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
return new ActivityProfileResult(1.0);
return new ActivityProfileResult(ref.getLocus(), 1.0);
}
}
if( USE_ALLELES_TRIGGER ) {
return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
}
if( context == null ) { return new ActivityProfileResult(0.0); }
if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); }
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
noCall.add(Allele.NO_CALL);
@ -369,7 +371,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
}
//---------------------------------------------------------------------------------------------------------------

View File

@ -0,0 +1,108 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
/**
* Basic unit test for AlleleBiasedDownsamplingUtils
*/
public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
@Test
public void testSmartDownsampling() {
final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50};
final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0};
// no contamination, no removal
testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
// hom sample, het contaminant, different alleles
testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
// hom sample, hom contaminant, different alleles
testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts);
// het sample, het contaminant, different alleles
testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
// het sample, hom contaminant, different alleles
testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
// hom sample, het contaminant, overlapping alleles
final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0};
testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts);
// hom sample, hom contaminant, overlapping alleles
testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0});
// het sample, het contaminant, overlapping alleles
testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55});
testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
// het sample, hom contaminant, overlapping alleles
testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts);
}
private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction,
final int pileupSize, final int[] initialCounts, final int[] targetCounts) {
final int[] actualCounts = initialCounts.clone();
actualCounts[0] += addA;
actualCounts[1] += addC;
actualCounts[2] += addG;
actualCounts[3] += addT;
final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction));
Assert.assertTrue(countsAreEqual(results, targetCounts));
}
private static boolean countsAreEqual(final int[] counts1, final int[] counts2) {
for ( int i = 0; i < 4; i++ ) {
if ( counts1[i] != counts2[i] )
return false;
}
return true;
}
}

View File

@ -51,21 +51,21 @@ public class BQSRIntegrationTest extends WalkerTest {
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
String HiSeqInterval = "chr1:10,000,000-10,100,000";
return new Object[][]{
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "55a46d8f5d2f9acfa2d7659e18b6df43")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "8e930f56a8905a5999af7d6ba8a92f91")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "8e87bee4bd6531b405082c4da785f1f5")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "b309a5f57b861d7f31cb76cdac4ff8a7")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "4c75d47ed2cf93b499be8fbb29b24dfd")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "43b06e5568a89e4ce1dd9146ce580c89")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "25f4f48dba27475b0cd7c06ef0239aba")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "dfcba9acc32b4a1dfeceea135b48615a")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "e8077b721f2e6f51c1945b6f6236835c")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "fbdc8d0fd312e3a7f49063c580cf5d92")},
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "4f47415628201a4f3c33e48ec066677b")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "1e89d2b88f4218363b9322b38e9536f2")},
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "a7beb0b16756257a274eecf73474ed90")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "dfcba9acc32b4a1dfeceea135b48615a")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2082c70e08f1c14290c3812021832f83")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")},
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")},
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "f70d8b5358bc2f76696f14b7a807ede0")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5e657fd6a44dcdc7674b6e5a2de5dc83")},
};
}
@ -90,6 +90,21 @@ public class BQSRIntegrationTest extends WalkerTest {
executeTest("testBQSRFailWithoutDBSNP", spec);
}
@Test
public void testBQSRCSV() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
" -T BaseRecalibrator" +
" -R " + b36KGReference +
" -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" +
" -knownSites " + b36dbSNP129 +
" -L 1:10,000,000-10,200,000" +
" -o /dev/null" +
" --plot_pdf_file /dev/null" +
" --intermediate_csv_file %s",
Arrays.asList("d1c38a3418979400630e2bca1140689c"));
executeTest("testBQSR-CSVfile", spec);
}
@Test
public void testBQSRFailWithSolidNoCall() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(

View File

@ -14,6 +14,9 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
final String DIVIDEBYZERO_BAM = validationDataLocation + "ReduceReadsDivideByZeroBug.bam";
final String DIVIDEBYZERO_L = " -L " + validationDataLocation + "ReduceReadsDivideByZeroBug.intervals";
final String L = " -L 20:10,100,000-10,120,000 ";
final String COREDUCTION_BAM_A = validationDataLocation + "coreduction.test.A.bam";
final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam";
final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057";
private void RRTest(String testName, String args, String md5) {
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s ";
@ -21,36 +24,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
executeTest(testName, spec);
}
@Test(enabled = false)
@Test(enabled = true)
public void testDefaultCompression() {
RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
RRTest("testDefaultCompression ", L, "98080d3c53f441564796fc143cf510da");
}
@Test(enabled = false)
@Test(enabled = true)
public void testMultipleIntervals() {
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
RRTest("testMultipleIntervals ", intervals, "c5dcdf4edf368b5b897d66f76034d9f0");
}
@Test(enabled = false)
@Test(enabled = true)
public void testHighCompression() {
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "27cb99e87eda5e46187e56f50dd37f26");
}
@Test(enabled = false)
@Test(enabled = true)
public void testLowCompression() {
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63");
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "4e7f111688d49973c35669855b7a2eaf");
}
@Test(enabled = false)
@Test(enabled = true)
public void testIndelCompression() {
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f6c9ea83608f35f113cf1f62a77ee6d0");
}
@Test(enabled = false)
@Test(enabled = true)
public void testFilteredDeletionCompression() {
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("122e4e60c4412a31d0aeb3cce879e841")));
}
/**
@ -61,20 +64,26 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
*
* This bam is simplified to replicate the exact bug with the three provided intervals.
*/
@Test(enabled = false)
@Test(enabled = true)
public void testAddingReadAfterTailingTheStash() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("647b0f0f95730de8e6bc4f74186ad4df")));
}
/**
* Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
* filtered out.
*/
@Test(enabled = false)
@Test(enabled = true)
public void testDivideByZero() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2")));
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("2c87985972dd43ee9dd50b463d93a511")));
}
@Test(enabled = true)
public void testCoReduction() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("5c30fde961a1357bf72c15144c01981b")));
}
}

View File

@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("cdec335abc9ad8e59335e39a73e0e95a"));
Arrays.asList("847605f4efafef89529fe0e496315edd"));
executeTest("test MultiSample Pilot1", spec);
}
@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("efddb5e258f97fd4f6661cff9eaa57de"));
Arrays.asList("5b31b811072a4df04524e13604015f9b"));
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
}
@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("24532eb381724cd74e99370da28d49ed"));
Arrays.asList("d9992e55381afb43742cc9b30fcd7538"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("062a946160eec1d0fc135d58ca654ff4"));
Arrays.asList("fea530fdc8677e10be4cc11625fa5376"));
executeTest("test SingleSample Pilot2", spec);
}
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("94dc17d76d841f1d3a36160767ffa034"));
Arrays.asList("704888987baacff8c7b273b8ab9938d0"));
executeTest("test Multiple SNP alleles", spec);
}
@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("9106d01ca0d0a8fedd068e72d509f380"));
Arrays.asList("e14c9b1f9f34d6c16de445bfa385be89"));
executeTest("test reverse trim", spec);
}
@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("d847acf841ba8ba653f996ce4869f439"));
Arrays.asList("fb204e821a24d03bd3a671b6e01c449a"));
executeTest("test mismatched PLs", spec);
}
@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "6792419c482e767a3deb28913ed2b1ad";
private final static String COMPRESSED_OUTPUT_MD5 = "5b8f477c287770b5769b05591e35bc2d";
@Test
public void testCompressedOutput() {
@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("56157d930da6ccd224bce1ca93f11e41"));
Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff"));
executeTest("test min_base_quality_score 26", spec);
}
@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("6ccb9bd88934e4272d0ce362dd35e603"));
Arrays.asList("55760482335497086458b09e415ecf54"));
executeTest("test SLOD", spec);
}
@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNDA() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("480437dd6e2760f4ab3194431519f331"));
Arrays.asList("938e888a40182878be4c3cc4859adb69"));
executeTest("test NDA", spec);
}
@ -173,7 +173,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("22c039412fd387dde6125b07c9a74a25"));
Arrays.asList("7dc186d420487e4e156a24ec8dea0951"));
executeTest("test using comp track", spec);
}
@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "40aeb4c9e31fe7046b72afc58e7599cb");
testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9");
}
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "c706ca93b25ff83613cb4e95dcac567c");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "8a263fd0a94463ce1de9990f2b8ec841");
testOutputParameters("--output_mode EMIT_ALL_SITES", "81fff490c0f59890f1e75dc290833434");
}
private void testOutputParameters(final String args, final String md5) {
@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("df524e98903d96ab9353bee7c16a69de"));
Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb"));
executeTest("test confidence 1", spec1);
}
@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "8e61498ca03a8d805372a64c466b3b42" );
testHeterozosity( 0.01, "8dd37249e0a80afa86594c3f1e720760" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "668d06b5173cf3b97d052726988e1d7b" );
testHeterozosity( 1.0 / 1850, "040d169e20fda56f8de009a6015eb384" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("908eb5e21fa39e7fb377cf4a9c4c7835"));
Arrays.asList("0e4713e4aa44f4f8fcfea7138295a627"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("c814558bb0ed2e19b12e1a2bf4465d52"));
Arrays.asList("46ea5d1ceb8eed1d0db63c3577915d6c"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("3593495aab5f6204c65de0b073a6ff65"));
Arrays.asList("50329e15e5139be9e3b643f0b3ba8a53"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("8b486a098029d5a106b0a37eff541c15"));
Arrays.asList("2b85e3bd6bf981afaf7324666740d74b"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("18efedc50cae2aacaba372265e38310b"));
Arrays.asList("a6fd46eff78827060451a62cffd698a7"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("3ff8c7c80a518aa3eb8671a21479de5f"));
Arrays.asList("b8129bf754490cc3c76191d8cc4ec93f"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("578c0540f4f2052a634a829bcb9cc27d"));
Arrays.asList("591332fa0b5b22778cf820ee257049d2"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("f7d0d0aee603df25c1f0525bb8df189e"));
Arrays.asList("a4761d7f25e7a62f34494801c98a0da7"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("fc91d457a16b4ca994959c2b5f3f0352"));
Arrays.asList("c526c234947482d1cd2ffc5102083a08"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("857b8e5df444463ac27f665c4f67fbe2"));
Arrays.asList("90adefd39ed67865b0cb275ad0f07383"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("81d4c7d9010fd6733b2997bc378e7471"));
Arrays.asList("2fded43949e258f8e9f68893c61c1bdd"));
executeTest("test minIndelFraction 0.25", spec);
}
@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNsInCigar() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1,
Arrays.asList("bd7984a374f0ae5d277bd5fc5065f64f"));
Arrays.asList("d6d40bacd540a41f305420dfea35e04a"));
executeTest("test calling on reads with Ns in CIGAR", spec);
}
@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("9a7cd58b9e3d5b72608c0d529321deba"));
Arrays.asList("c1077662411164182c5f75478344f83d"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamSNPs() {
testReducedCalling("SNP", "e7fc11baf208a1bca7b462d3148c936e");
testReducedCalling("SNP", "f5ccbc96d0d66832dd9b3c5cb6507db4");
}
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "132a4e0ccf9230b5bb4b56c649e2bdd5");
testReducedCalling("INDEL", "3c02ee5187933bed44dc416a2e28511f");
}
@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testContaminationDownsampling() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1,
Arrays.asList("27dd04159e06d9524fb8a4eef41f96ae"));
Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb"));
executeTest("test contamination_percentage_to_filter 0.20", spec);
}

View File

@ -21,17 +21,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "aa1df35d6e64d7ca93feb4d2dd15dd0e");
HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "186c7f322978283c01249c6de2829215");
HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f");
}
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "de9e78a52207fe62144dba5337965469");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "39da622b309597d7a0b082c8aa1748c9");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "000dbb1b48f94d017cfec127c6cabe8f");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a");
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -64,20 +64,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "b369c2a6cb5c99a424551b33bae16f3b");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8");
}
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939"));
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("788176e1717bd28fc7cbc8e3efbb6100"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6c67ee8e99cc8f53a6587bb26028047"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -91,7 +91,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("4beb9f87ab3f316a9384c3d0dca6ebe9"));
Arrays.asList("425f1a0fb00d7145edf1c55e54346fae"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
}

View File

@ -350,11 +350,11 @@ public class WalkerManager extends PluginManager<Walker> {
* @return A name for this type of walker.
*/
@Override
public String getName(Class<? extends Walker> walkerType) {
public String getName(Class walkerType) {
String walkerName = "";
if (walkerType.getAnnotation(WalkerName.class) != null)
walkerName = walkerType.getAnnotation(WalkerName.class).value().trim();
walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim();
else
walkerName = super.getName(walkerType);

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.contexts;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -39,10 +38,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
* @author hanna
* @version 0.1
*/
public class ReferenceContext {
final public static boolean UPPERCASE_REFERENCE = true;
/**
* Facilitates creation of new GenomeLocs.
*/
@ -59,7 +55,8 @@ public class ReferenceContext {
final private GenomeLoc window;
/**
* The bases in the window around the current locus. If null, then bases haven't been fetched yet
* The bases in the window around the current locus. If null, then bases haven't been fetched yet.
* Bases are always upper cased
*/
private byte[] basesCache = null;
@ -81,7 +78,7 @@ public class ReferenceContext {
*
* @return
*/
@Ensures("result != null")
@Ensures({"result != null"})
public byte[] getBases();
}
@ -146,7 +143,9 @@ public class ReferenceContext {
private void fetchBasesFromProvider() {
if ( basesCache == null ) {
basesCache = basesProvider.getBases();
if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache);
// must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time
assert BaseUtils.isUpperCase(basesCache);
}
}
@ -194,6 +193,7 @@ public class ReferenceContext {
/**
* All the bases in the window from the current base forward to the end of the window.
*/
@Ensures({"result != null", "result.length > 0"})
public byte[] getForwardBases() {
final byte[] bases = getBases();
final int mid = locus.getStart() - window.getStart();

View File

@ -198,8 +198,6 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
source.close();
file.delete(); // this should be last to aid in debugging when the process fails
} catch (UserException e) {
throw new ReviewedStingException("BUG: intermediate file " + file + " is malformed, got error while reading", e);
} catch (IOException e) {
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.report;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
public enum GATKReportVersion {
/**
@ -91,6 +92,6 @@ public enum GATKReportVersion {
if (header.startsWith("#:GATKReport.v1.1"))
return GATKReportVersion.V1_1;
throw new ReviewedStingException("Unknown GATK report version in header: " + header);
throw new UserException.BadInput("The GATK report has an unknown/unsupported version in the header: " + header);
}
}

View File

@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
@ -35,6 +34,9 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
private final LinkedList<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();
// package access for unit testing
ActivityProfile profile;
@Override
public String getTraversalUnits() {
return "active regions";
@ -46,99 +48,127 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
T sum) {
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
final LocusView locusView = getLocusView( walker, dataProvider );
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
final LocusView locusView = new AllLocusView(dataProvider);
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE;
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
int minStart = Integer.MAX_VALUE;
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
GenomeLoc location = locus.getLocation();
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
final GenomeLoc location = locus.getLocation();
if(prevLoc != null) {
// fill in the active / inactive labels from the stop of the previous location to the start of this location
// TODO refactor to separate function
for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) {
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
profile.add(fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ));
}
}
// Grab all the previously unseen reads from this pileup and add them to the massive read list
// Note that this must occur before we leave because we are outside the intervals because
// reads may occur outside our intervals but overlap them in the future
// TODO -- this whole HashSet logic should be changed to a linked list of reads with
// TODO -- subsequent pass over them to find the ones overlapping the active regions
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
}
// Grab all the previously unseen reads from this pileup and add them to the massive read list
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
minStart = Math.min(minStart, read.getAlignmentStart());
}
prevLoc = location;
printProgress(locus.getLocation());
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
minStart = Math.min(minStart, read.getAlignmentStart());
}
updateCumulativeMetrics(dataProvider.getShard());
// skip this location -- it's not part of our engine intervals
// TODO -- this is dangerously slow with current overlaps implementation : GSA-649 / GenomeLocSortedSet.overlaps is crazy slow
if ( outsideEngineIntervals(location) )
continue;
// Take the individual isActive calls and integrate them into contiguous active regions and
// add these blocks of work to the work queue
// band-pass filter the list of isActive probabilities and turn into active regions
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
if( !activeRegions.isEmpty() ) {
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
}
}
workQueue.addAll( activeRegions );
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
// we've move across some interval boundary, restart profile
profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
}
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
prevLoc = location;
printProgress(locus.getLocation());
}
updateCumulativeMetrics(dataProvider.getShard());
if ( ! profile.isEmpty() )
incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
if( !activeRegions.isEmpty() ) {
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
}
}
workQueue.addAll( activeRegions );
}
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
return sum;
}
/**
* Is the loc outside of the intervals being requested for processing by the GATK?
* @param loc
* @return
*/
private boolean outsideEngineIntervals(final GenomeLoc loc) {
return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc);
}
/**
* Take the individual isActive calls and integrate them into contiguous active regions and
* add these blocks of work to the work queue
* band-pass filter the list of isActive probabilities and turn into active regions
*
* @param profile
* @param activeRegions
* @param activeRegionExtension
* @param maxRegionSize
* @return
*/
private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
final List<ActiveRegion> activeRegions,
final int activeRegionExtension,
final int maxRegionSize) {
if ( profile.isEmpty() )
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
}
// --------------------------------------------------------------------------------
//
@ -150,7 +180,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final RefMetaDataTracker tracker, final ReferenceContext refContext,
final AlignmentContext locus, final GenomeLoc location) {
if ( walker.hasPresetActiveRegions() ) {
return new ActivityProfileResult(walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
} else {
return walker.isActive( tracker, refContext, locus );
}
@ -250,30 +280,6 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
return walker.reduce( x, sum );
}
// --------------------------------------------------------------------------------
//
// engine interaction code
//
// --------------------------------------------------------------------------------
/**
* Gets the best view of loci for this walker given the available data.
* @param walker walker to interrogate.
* @param dataProvider Data which which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( final Walker<M,T> walker, final LocusShardDataProvider dataProvider ) {
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
return new AllLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
return new RodLocusView(dataProvider);
else
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine

View File

@ -277,8 +277,12 @@ public class VariantAnnotatorEngine {
if ( expression.fieldName.equals("ID") ) {
if ( vc.hasID() )
infoAnnotations.put(expression.fullName, vc.getID());
} else if (expression.fieldName.equals("ALT")) {
infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString());
} else if ( vc.hasAttribute(expression.fieldName) ) {
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
}
}
}

View File

@ -25,35 +25,41 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMFileHeader;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.recalibration.QuantizationInfo;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
import org.broadinstitute.sting.utils.recalibration.*;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
@ -103,19 +109,20 @@ import java.util.ArrayList;
* </pre>
*/
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class})
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@By(DataSource.READS)
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
public class BaseRecalibrator extends LocusWalker<Long, Long> {
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
@PartitionBy(PartitionType.READ)
public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSchedulable {
@ArgumentCollection
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates
@Advanced
@Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
public double BAQGOP = BAQ.DEFAULT_GOP;
private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization
private RecalibrationTables recalibrationTables;
private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental)
@ -124,12 +131,12 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
private int minimumQToUse;
protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
/**
* Parse the -cov arguments and create a list of covariates to be used here
@ -137,6 +144,8 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
*/
public void initialize() {
baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty
// check for unsupported access
if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals)
throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument");
@ -185,13 +194,21 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
recalibrationEngine.initialize(requestedCovariates, recalibrationTables);
minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
try {
// fasta reference reader for use with BAQ calculation
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
} catch( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
}
}
private RecalibrationEngine initializeRecalibrationEngine() {
final Class recalibrationEngineClass = GATKLiteUtils.getProtectedClassIfAvailable(RecalibrationEngine.class);
try {
Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null);
final Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null);
constructor.setAccessible(true);
return (RecalibrationEngine)constructor.newInstance();
}
@ -200,60 +217,207 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
}
}
private boolean readHasBeenSkipped( final GATKSAMRecord read ) {
return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE);
}
private boolean isLowQualityBase( final PileupElement p ) {
return p.getQual() < minimumQToUse;
}
private boolean readNotSeen( final GATKSAMRecord read ) {
return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE);
private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) {
return read.getBaseQualities()[offset] < minimumQToUse;
}
/**
* For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location
*
* @param tracker the reference metadata tracker
* @param ref the reference context
* @param context the alignment context
* @return returns 1, but this value isn't used in the reduce step
*/
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
long countedSites = 0L;
// Only analyze sites not present in the provided known sites
if (tracker.getValues(RAC.knownSites).size() == 0) {
for (final PileupElement p : context.getBasePileup()) {
final GATKSAMRecord read = p.getRead();
final int offset = p.getOffset();
public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) )
continue;
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead);
if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it
if (readNotSeen(read)) {
read.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
RecalUtils.parsePlatformForRead(read, RAC);
if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) {
read.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
continue;
RecalUtils.parsePlatformForRead(read, RAC);
if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls
return 0L; // skip this read completely
}
read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates));
final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
final int[] isSNP = calculateIsSNP(read, ref, originalRead);
final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
final byte[] baqArray = calculateBAQArray(read);
if( baqArray != null ) { // some reads just can't be BAQ'ed
final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);
recalibrationEngine.updateDataForRead(read, skip, snpErrors, insertionErrors, deletionErrors);
return 1L;
} else {
return 0L;
}
}
protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
final byte[] bases = read.getReadBases();
final boolean[] skip = new boolean[bases.length];
final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites));
for( int iii = 0; iii < bases.length; iii++ ) {
skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii);
}
return skip;
}
protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) {
return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset);
}
protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
final int BUFFER_SIZE = 0;
final int readLength = read.getReadBases().length;
final boolean[] knownSites = new boolean[readLength];
Arrays.fill(knownSites, false);
for( final Feature f : features ) {
int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; }
int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; }
Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true);
}
return knownSites;
}
// BUGBUG: can be merged with calculateIsIndel
protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) {
final byte[] readBases = read.getReadBases();
final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd());
final int[] snp = new int[readBases.length];
int readPos = 0;
int refPos = 0;
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
final int elementLength = ce.getLength();
switch (ce.getOperator()) {
case M:
case EQ:
case X:
for( int iii = 0; iii < elementLength; iii++ ) {
snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 );
readPos++;
refPos++;
}
read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates));
}
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if (!ReadUtils.isSOLiDRead(read) ||
RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING ||
RecalUtils.isColorSpaceConsistent(read, offset))
// This base finally passed all the checks for a good base, so add it to the big data hashmap
recalibrationEngine.updateDataForPileupElement(p, ref.getBase());
break;
case D:
case N:
refPos += elementLength;
break;
case I:
case S: // ReferenceContext doesn't have the soft clipped bases!
readPos += elementLength;
break;
case H:
case P:
break;
default:
throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
}
countedSites++;
}
return snp;
}
protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
final byte[] readBases = read.getReadBases();
final int[] indel = new int[readBases.length];
Arrays.fill(indel, 0);
int readPos = 0;
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
final int elementLength = ce.getLength();
switch (ce.getOperator()) {
case M:
case EQ:
case X:
case S:
{
readPos += elementLength;
break;
}
case D:
{
final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) );
indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 );
break;
}
case I:
{
final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
if( forwardStrandRead ) {
indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
}
for (int iii = 0; iii < elementLength; iii++) {
readPos++;
}
if( !forwardStrandRead ) {
indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
}
break;
}
case N:
case H:
case P:
break;
default:
throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
}
}
return indel;
}
protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
if(errorArray.length != baqArray.length ) {
throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
}
return countedSites;
final byte NO_BAQ_UNCERTAINTY = (byte)'@';
final int BLOCK_START_UNSET = -1;
final double[] fractionalErrors = new double[baqArray.length];
Arrays.fill(fractionalErrors, 0.0);
boolean inBlock = false;
int blockStartIndex = BLOCK_START_UNSET;
int iii;
for( iii = 0; iii < fractionalErrors.length; iii++ ) {
if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) {
if( !inBlock ) {
fractionalErrors[iii] = (double) errorArray[iii];
} else {
calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors);
inBlock = false; // reset state variables
blockStartIndex = BLOCK_START_UNSET; // reset state variables
}
} else {
inBlock = true;
if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; }
}
}
if( inBlock ) {
calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors);
}
if( fractionalErrors.length != errorArray.length ) {
throw new ReviewedStingException("Output array length mismatch detected. Malformed read?");
}
return fractionalErrors;
}
private static void calculateAndStoreErrorsInBlock( final int iii,
final int blockStartIndex,
final int[] errorArray,
final double[] fractionalErrors ) {
int totalErrors = 0;
for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) {
totalErrors += errorArray[jjj];
}
for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) {
fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1));
}
}
private byte[] calculateBAQArray( final GATKSAMRecord read ) {
baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
return BAQ.getBAQTag(read);
}
/**
@ -286,12 +450,12 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
generateReport();
logger.info("...done!");
if (RAC.RECAL_PDF_FILE != null) {
if ( RAC.RECAL_PDF_FILE != null ) {
logger.info("Generating recalibration plots...");
generatePlots();
}
logger.info("Processed: " + result + " sites");
logger.info("Processed: " + result + " reads");
}
private void generatePlots() {
@ -304,7 +468,6 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates);
}
/**
* go through the quality score table and use the # observations and the empirical quality score
* to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
@ -317,5 +480,4 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
private void generateReport() {
RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates);
}
}
}

View File

@ -75,8 +75,9 @@ public class RecalibrationArgumentCollection {
/**
* If not provided, then a temporary file is created and then deleted upon completion.
* For advanced users only.
*/
@Hidden
@Advanced
@Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
public File RECAL_CSV_FILE = null;

View File

@ -33,7 +33,5 @@ public interface RecalibrationEngine {
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables);
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase);
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors);
}

View File

@ -46,41 +46,30 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
this.recalibrationTables = recalibrationTables;
}
/**
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
* adding one to the number of observations and potentially one to the number of mismatches for mismatches only.
*
* @param pileupElement The pileup element to update
* @param refBase The reference base at this locus
*/
@Override
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
final int offset = pileupElement.getOffset();
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
final byte qual = pileupElement.getQual();
final boolean isError = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase);
final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION);
final int eventIndex = EventType.BASE_SUBSTITUTION.index;
// TODO: should this really be combine rather than increment?
combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex);
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
for (int i = 2; i < covariates.length; i++) {
if (keys[i] < 0)
continue;
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
}
}
@Override
public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version");
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
if( !skip[offset] ) {
final ReadCovariates readCovariates = covariateKeySetFrom(read);
final byte qual = read.getBaseQualities()[offset];
final double isError = snpErrors[offset];
final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION);
final int eventIndex = EventType.BASE_SUBSTITUTION.index;
combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex);
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
for (int i = 2; i < covariates.length; i++) {
if (keys[i] < 0)
continue;
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
}
}
}
}
/**
@ -90,10 +79,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
* @param isError whether or not the observation is an error
* @return a new RecalDatum object with the observation and the error
*/
protected RecalDatum createDatumObject(final byte reportedQual, final boolean isError) {
return new RecalDatum(1, isError ? 1:0, reportedQual);
}
protected RecalDatum createDatumObject(final byte reportedQual, final double isError) {
return new RecalDatum(1, isError, reportedQual);
}
@ -108,46 +93,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE);
}
/**
* Increments the RecalDatum at the specified position in the specified table, or put a new item there
* if there isn't already one.
*
* Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put()
* to return false if another thread inserts a new item at our position in the middle of our put operation.
*
* @param table the table that holds/will hold our item
* @param qual qual for this event
* @param isError error status for this event
* @param keys location in table of our item
*/
protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray<RecalDatum> table, final byte qual, final boolean isError, final int... keys ) {
final RecalDatum existingDatum = table.get(keys);
// There are three cases here:
//
// 1. The original get succeeded (existingDatum != null) because there already was an item in this position.
// In this case we can just increment the existing item.
//
// 2. The original get failed (existingDatum == null), and we successfully put a new item in this position
// and are done.
//
// 3. The original get failed (existingDatum == null), AND the put fails because another thread comes along
// in the interim and puts an item in this position. In this case we need to do another get after the
// failed put to get the item the other thread put here and increment it.
if ( existingDatum == null ) {
// No existing item, try to put a new one
if ( ! table.put(createDatumObject(qual, isError), keys) ) {
// Failed to put a new item because another thread came along and put an item here first.
// Get the newly-put item and increment it (item is guaranteed to exist at this point)
table.get(keys).increment(isError);
}
}
else {
// Easy case: already an item here, so increment it
existingDatum.increment(isError);
}
}
/**
* Increments the RecalDatum at the specified position in the specified table, or put a new item there
* if there isn't already one.
@ -177,36 +122,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
}
}
/**
* Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a
* new item there if there isn't already one.
*
* Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put()
* to return false if another thread inserts a new item at our position in the middle of our put operation.
*
* @param table the table that holds/will hold our item
* @param qual qual for this event
* @param isError error status for this event
* @param keys location in table of our item
*/
protected void combineDatumOrPutIfNecessary( final NestedIntegerArray<RecalDatum> table, final byte qual, final boolean isError, final int... keys ) {
final RecalDatum existingDatum = table.get(keys);
final RecalDatum newDatum = createDatumObject(qual, isError);
if ( existingDatum == null ) {
// No existing item, try to put a new one
if ( ! table.put(newDatum, keys) ) {
// Failed to put a new item because another thread came along and put an item here first.
// Get the newly-put item and combine it with our item (item is guaranteed to exist at this point)
table.get(keys).combine(newDatum);
}
}
else {
// Easy case: already an item here, so combine it with our item
existingDatum.combine(newDatum);
}
}
/**
* Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a
* new item there if there isn't already one.

View File

@ -0,0 +1,73 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.SortedSet;
/**
* GenomeLocs are very useful objects to keep track of genomic locations and perform set operations
* with them.
*
* However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot
* be created easily for small tasks that do not require the rigors of the GenomeLocParser validation
*
* SimpleGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should
* only be used outside of the engine.
*
* User: carneiro
* Date: 10/16/12
* Time: 2:07 PM
*/
public class SimpleGenomeLoc extends GenomeLoc {
private boolean finished;
public SimpleGenomeLoc(String contigName, int contigIndex, int start, int stop, boolean finished) {
super(contigName, contigIndex, start, stop);
this.finished = finished;
}
public boolean isFinished() {
return finished;
}
@Requires("a != null && b != null")
public static SimpleGenomeLoc merge(SimpleGenomeLoc a, SimpleGenomeLoc b) throws ReviewedStingException {
if(GenomeLoc.isUnmapped(a) || GenomeLoc.isUnmapped(b)) {
throw new ReviewedStingException("Tried to merge unmapped genome locs");
}
if (!(a.contiguousP(b))) {
throw new ReviewedStingException("The two genome locs need to be contiguous");
}
return new SimpleGenomeLoc(a.getContig(), a.contigIndex,
Math.min(a.getStart(), b.getStart()),
Math.max(a.getStop(), b.getStop()),
a.isFinished());
}
/**
* Merges a list of *sorted* *contiguous* locs into one
*
* @param sortedLocs a sorted list of contiguous locs
* @return one merged loc
*/
public static SimpleGenomeLoc merge(SortedSet<SimpleGenomeLoc> sortedLocs) {
SimpleGenomeLoc previousLoc = null;
for (SimpleGenomeLoc loc : sortedLocs) {
if (loc.isUnmapped()) {
throw new ReviewedStingException("Tried to merge unmapped genome locs");
}
if (previousLoc != null && !previousLoc.contiguousP(loc)) {
throw new ReviewedStingException("The genome locs need to be contiguous");
}
previousLoc = loc;
}
SimpleGenomeLoc firstLoc = sortedLocs.first();
SimpleGenomeLoc lastLoc = sortedLocs.last();
return merge(firstLoc, lastLoc);
}
}

View File

@ -191,7 +191,7 @@ public class CallableLoci extends LocusWalker<CallableLoci.CallableBaseState, Ca
*/
@Advanced
@Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
OutputFormat outputFormat;
OutputFormat outputFormat = OutputFormat.BED;
public enum OutputFormat {
/**
@ -297,7 +297,7 @@ public class CallableLoci extends LocusWalker<CallableLoci.CallableBaseState, Ca
}
public String toString() {
return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
return String.format("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart()-1, loc.getStop(), state);
}
}

View File

@ -57,7 +57,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());
// note the linear probability scale
return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1));
return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1));
}

View File

@ -190,6 +190,10 @@ public class UnifiedGenotyperEngine {
final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap);
if ( vc != null )
results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap));
// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set
// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES)
// results.add(generateEmptyContext(tracker, refContext, null, rawContext));
}
}
}

View File

@ -25,7 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import com.google.java.contract.Requires;
import net.sf.samtools.*;
import net.sf.samtools.util.RuntimeIOException;
import net.sf.samtools.util.SequenceUtil;
@ -236,6 +236,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
* then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the
* value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output
* file name (2nd column) for each input file name (1st column).
*
* Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing).
*/
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
protected String N_WAY_OUT = null;
@ -274,7 +276,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
protected String OUT_SNPS = null;
// fasta reference reader to supplement the edges of the reference sequence
private IndexedFastaSequenceFile referenceReader;
private CachingIndexedFastaSequenceFile referenceReader;
// the intervals input by the user
private Iterator<GenomeLoc> intervals = null;
@ -1601,7 +1603,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
public List<GATKSAMRecord> getReads() { return reads; }
public byte[] getReference(IndexedFastaSequenceFile referenceReader) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getReference(CachingIndexedFastaSequenceFile referenceReader) {
// set up the reference if we haven't done so yet
if ( reference == null ) {
// first, pad the reference to handle deletions in narrow windows (e.g. those with only 1 read)
@ -1609,7 +1612,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
int padRight = Math.min(loc.getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(loc.getContig()).getSequenceLength());
loc = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), padLeft, padRight);
reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
StringUtil.toUpperCase(reference);
}
return reference;

View File

@ -183,6 +183,10 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
@Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
private boolean keepSitesWithAC0 = false;
@Hidden
@Argument(fullName="numSamples", shortName="numSamples", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
private int numSamplesFromArgument = 0;
/**
* If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
* variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
@ -589,6 +593,14 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; }
public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
public int getNumberOfSamplesForEvaluation() {
if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty())
return sampleNamesForEvaluation.size();
else {
return numSamplesFromArgument;
}
}
public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }
public List<RodBinding<VariantContext>> getComps() { return comps; }

View File

@ -1,249 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.ArrayList;
import java.util.HashMap;
/**
* @author rpoplin
* @since Apr 6, 2010
*/
//@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score")
@Deprecated
public class VariantQualityScore {
// TODO - this should really be a stratification
// public class VariantQualityScore extends VariantEvaluator {
//
// // a mapping from quality score histogram bin to Ti/Tv ratio
// @DataPoint(description = "the Ti/Tv ratio broken out by variant quality")
// TiTvStats titvStats = null;
//
// @DataPoint(description = "average variant quality for each allele count")
// AlleleCountStats alleleCountStats = null;
//
// static class TiTvStats extends TableType {
// final static int NUM_BINS = 20;
// final HashMap<Integer, Pair<Long,Long>> qualByIsTransition = new HashMap<Integer, Pair<Long,Long>>(); // A hashMap holds all the qualities until we are able to bin them appropriately
// final long transitionByQuality[] = new long[NUM_BINS];
// final long transversionByQuality[] = new long[NUM_BINS];
// final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out
//
// public Object[] getRowKeys() {
// return new String[]{"sample"};
// }
//
// public Object[] getColumnKeys() {
// final String columnKeys[] = new String[NUM_BINS];
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
// columnKeys[iii] = "titvBin" + iii;
// }
// return columnKeys;
// }
//
// public String getCell(int x, int y) {
// return String.valueOf(titvByQuality[y]);
// }
//
// public String toString() {
// StringBuffer returnString = new StringBuffer();
// // output the ti/tv array
// returnString.append("titvByQuality: ");
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
// returnString.append(titvByQuality[iii]);
// returnString.append(" ");
// }
// return returnString.toString();
// }
//
// public void incrValue( final double qual, final boolean isTransition ) {
// final Integer qualKey = Math.round((float) qual);
// final long numTransition = (isTransition ? 1L : 0L);
// final long numTransversion = (isTransition ? 0L : 1L);
// if( qualByIsTransition.containsKey(qualKey) ) {
// Pair<Long,Long> transitionPair = qualByIsTransition.get(qualKey);
// transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion);
// qualByIsTransition.put(qualKey, transitionPair);
// } else {
// qualByIsTransition.put(qualKey, new Pair<Long,Long>(numTransition,numTransversion));
// }
// }
//
// public void organizeTiTvTables() {
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
// transitionByQuality[iii] = 0L;
// transversionByQuality[iii] = 0L;
// titvByQuality[iii] = 0.0;
// }
//
// int maxQual = 0;
//
// // Calculate the maximum quality score in order to normalize and histogram
// for( final Integer qual : qualByIsTransition.keySet() ) {
// if( qual > maxQual ) {
// maxQual = qual;
// }
// }
//
// final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1));
//
// for( final Integer qual : qualByIsTransition.keySet() ) {
// final int index = (int)Math.floor( ((double) qual) / binSize );
// if( index >= 0 ) { // BUGBUG: why is there overflow here?
// Pair<Long,Long> transitionPair = qualByIsTransition.get(qual);
// transitionByQuality[index] += transitionPair.getFirst();
// transversionByQuality[index] += transitionPair.getSecond();
// }
// }
//
// for( int iii = 0; iii < NUM_BINS; iii++ ) {
// if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio
// titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]);
// } else {
// titvByQuality[iii] = 0.0;
// }
// }
//
// }
// }
//
// class AlleleCountStats extends TableType {
// final HashMap<Integer, ArrayList<Double>> qualityListMap = new HashMap<Integer, ArrayList<Double>>();
// final HashMap<Integer, Double> qualityMap = new HashMap<Integer, Double>();
//
// public Object[] getRowKeys() {
// final int NUM_BINS = qualityListMap.keySet().size();
// final String rowKeys[] = new String[NUM_BINS];
// int iii = 0;
// for( final Integer key : qualityListMap.keySet() ) {
// rowKeys[iii] = "AC" + key;
// iii++;
// }
// return rowKeys;
//
// }
//
// public Object[] getColumnKeys() {
// return new String[]{"alleleCount","avgQual"};
// }
//
// public String getCell(int x, int y) {
// int iii = 0;
// for( final Integer key : qualityListMap.keySet() ) {
// if(iii == x) {
// if(y == 0) { return String.valueOf(key); }
// else { return String.valueOf(qualityMap.get(key)); }
// }
// iii++;
// }
// return null;
// }
//
// public String toString() {
// String returnString = "";
// // output the quality map
// returnString += "AlleleCountStats: ";
// //for( int iii = 0; iii < NUM_BINS; iii++ ) {
// // returnString += titvByQuality[iii] + " ";
// //}
// return returnString;
// }
//
// public void incrValue( final double qual, final int alleleCount ) {
// ArrayList<Double> list = qualityListMap.get(alleleCount);
// if(list==null) { list = new ArrayList<Double>(); }
// list.add(qual);
// qualityListMap.put(alleleCount, list);
// }
//
// public void organizeAlleleCountTables() {
// for( final Integer key : qualityListMap.keySet() ) {
// final ArrayList<Double> list = qualityListMap.get(key);
// double meanQual = 0.0;
// final double numQuals = (double)list.size();
// for( Double qual : list ) {
// meanQual += qual / numQuals;
// }
// qualityMap.put(key, meanQual);
// }
// }
// }
//
// //public VariantQualityScore(VariantEvalWalker parent) {
// //super(parent);
// //}
//
// public String getName() {
// return "VariantQualityScore";
// }
//
// public int getComparisonOrder() {
// return 1; // we only need to see each eval track
// }
//
// public String toString() {
// return getName();
// }
//
// public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
// final String interesting = null;
//
// if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
// if( titvStats == null ) { titvStats = new TiTvStats(); }
// titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval));
//
// if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); }
// int alternateAlleleCount = 0;
// for (final Allele a : eval.getAlternateAlleles()) {
// alternateAlleleCount += eval.getCalledChrCount(a);
// }
// alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount);
// }
//
// return interesting; // This module doesn't capture any interesting sites, so return null
// }
//
// public void finalizeEvaluation() {
// if( titvStats != null ) {
// titvStats.organizeTiTvTables();
// }
// if( alleleCountStats != null ) {
// alleleCountStats.organizeAlleleCountTables();
// }
// }
}

View File

@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier {
// There are ploidy x n sample chromosomes
// TODO -- generalize to handle multiple ploidy
nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy();
nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy();
if ( nchrom < 2 )
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample");

View File

@ -659,7 +659,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED)));
}
private boolean haveSameGenotypes(Genotype g1, Genotype g2) {
private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) {
if ( g1 == null || g2 == null )
return false;
if ((g1.isCalled() && g2.isFiltered()) ||
(g2.isCalled() && g1.isFiltered()) ||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))

View File

@ -46,16 +46,20 @@ import java.util.Set;
/**
* Strictly validates a variants file.
* Validates a VCF file with an extra strict set of criteria.
*
* <p>
* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
* Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs
* when a dbSNP file is provided, and that all alternate alleles are present in at least one sample.
* In addition to standard adherence to the VCF specification, this tool performs extra checks to make ensure
* the information contained within the file is correct. Checks include the correctness of the reference base(s),
* accuracy of AC & AN values, tests against rsIDs when a dbSNP file is provided, and that all alternate alleles
* are present in at least one sample.
*
* If you are looking simply to test the adherence to the VCF specification, use --validationType NONE.
*
* <h2>Input</h2>
* <p>
* A variant set to filter.
* A variant set to validate.
* </p>
*
* <h2>Examples</h2>
@ -79,10 +83,9 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public enum ValidationType {
ALL, REF, IDS, ALLELES, CHR_COUNTS
ALL, REF, IDS, ALLELES, CHR_COUNTS, NONE
}
@Hidden
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
protected ValidationType type = ValidationType.ALL;
@ -172,7 +175,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
numErrors++;
logger.warn("***** " + e.getMessage() + " *****");
} else {
throw new UserException.MalformedFile(file, e.getMessage());
throw new UserException.FailsStrictValidation(file, e.getMessage());
}
}
}

View File

@ -24,33 +24,6 @@ public class BaseUtils {
public final static byte[] BASES = {'A', 'C', 'G', 'T'};
public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
public enum Base {
A('A', 0),
C('C', 1),
G('G', 2),
T('T', 3);
byte b;
int index;
private Base(char base, int index) {
this.b = (byte) base;
this.index = index;
}
public byte getBase() { return b; }
public char getBaseAsChar() { return (char) b; }
public int getIndex() { return index; }
public boolean sameBase(byte o) { return b == o; }
public boolean sameBase(char o) { return b == (byte) o; }
public boolean sameBase(int i) { return index == i; }
}
static private final int[] baseIndexMap = new int[256];
static {
Arrays.fill(baseIndexMap, -1);
@ -130,6 +103,17 @@ public class BaseUtils {
return false;
}
public static boolean isUpperCase(final byte[] bases) {
for ( byte base : bases )
if ( ! isUpperCase(base) )
return false;
return true;
}
public static boolean isUpperCase(final byte base) {
return base >= 'A' && base <= 'Z';
}
/**
* Converts a IUPAC nucleotide code to a pair of bases
*
@ -271,59 +255,6 @@ public class BaseUtils {
}
}
/**
* Converts a base index to a base index representing its cross-talk partner
*
* @param baseIndex 0, 1, 2, 3
* @return 1, 0, 3, 2, or -1 if the index can't be understood
*/
static public int crossTalkPartnerIndex(int baseIndex) {
switch (baseIndex) {
case 0:
return 1; // A -> C
case 1:
return 0; // C -> A
case 2:
return 3; // G -> T
case 3:
return 2; // T -> G
default:
return -1;
}
}
/**
* Converts a base to the base representing its cross-talk partner
*
* @param base [AaCcGgTt]
* @return C, A, T, G, or '.' if the base can't be understood
*/
@Deprecated
static public char crossTalkPartnerBase(char base) {
return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
}
/**
* Return the complement of a base index.
*
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
* @return the complementary base index
*/
static public byte complementIndex(int baseIndex) {
switch (baseIndex) {
case 0:
return 3; // a -> t
case 1:
return 2; // c -> g
case 2:
return 1; // g -> c
case 3:
return 0; // t -> a
default:
return -1; // wtf?
}
}
/**
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
*
@ -350,7 +281,7 @@ public class BaseUtils {
}
@Deprecated
static public char simpleComplement(char base) {
static private char simpleComplement(char base) {
return (char) simpleComplement((byte) base);
}
@ -370,22 +301,6 @@ public class BaseUtils {
return rcbases;
}
/**
* Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
*
* @param bases the byte array of bases
* @return the complement of the base byte array
*/
static public byte[] simpleComplement(byte[] bases) {
byte[] rcbases = new byte[bases.length];
for (int i = 0; i < bases.length; i++) {
rcbases[i] = simpleComplement(bases[i]);
}
return rcbases;
}
/**
* Reverse complement a char array of bases
*
@ -403,23 +318,6 @@ public class BaseUtils {
return rcbases;
}
/**
* Complement a char array of bases
*
* @param bases the char array of bases
* @return the complement of the base char array
*/
@Deprecated
static public char[] simpleComplement(char[] bases) {
char[] rcbases = new char[bases.length];
for (int i = 0; i < bases.length; i++) {
rcbases[i] = simpleComplement(bases[i]);
}
return rcbases;
}
/**
* Reverse complement a String of bases. Preserves ambiguous bases.
*
@ -431,17 +329,6 @@ public class BaseUtils {
return new String(simpleReverseComplement(bases.getBytes()));
}
/**
* Complement a String of bases. Preserves ambiguous bases.
*
* @param bases the String of bases
* @return the complement of the String
*/
@Deprecated
static public String simpleComplement(String bases) {
return new String(simpleComplement(bases.getBytes()));
}
/**
* Returns the uppercased version of the bases
*
@ -543,82 +430,4 @@ public class BaseUtils {
return randomBaseIndex;
}
/**
* Return a random base (A, C, G, T).
*
* @return a random base (A, C, G, T)
*/
@Deprecated
static public byte getRandomBase() {
return getRandomBase('.');
}
/**
* Return a random base, excluding some base.
*
* @param excludeBase the base to exclude
* @return a random base, excluding the one specified (A, C, G, T)
*/
@Deprecated
static public byte getRandomBase(char excludeBase) {
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
}
/**
* Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
* of 2 (it has a period of 4 as well), and so does
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
* the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
* or if specified minPeriod is greater than the sequence length.
*
* @param seq
* @return
*/
public static int sequencePeriod(byte[] seq, int minPeriod) {
int period = (minPeriod > seq.length ? seq.length : minPeriod);
// we assume that bases [0,period-1] repeat themselves and check this assumption
// until we find correct period
for (int pos = period; pos < seq.length; pos++) {
int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
// if our current hypothesis holds, base[pos] must be the same as base[offset]
if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
// period we have been trying so far does not work.
// two possibilities:
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
// pos will be autoincremented and we will be checking next base
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
// hence we should first check if it matches the first base of the sequence, and to do that
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
// on the next loop re-entrance after pos is autoincremented)
if (offset == 0)
period = pos + 1;
else
period = pos--;
}
}
return period;
}
}
/* code snippet for testing sequencePeriod():
*
* String str = "CCTTG";
int p = 0;
System.out.print("Periods of " + str +" are:");
while ( p < str.length() ) {
p = sequencePeriod(str, p+1);
System.out.print(" "+p);
}
System.out.println(); System.exit(1);
*/

View File

@ -687,23 +687,69 @@ public class Utils {
array[i] = value;
}
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME);
SAMFileHeader header = toolkit.getSAMFileHeader();
/**
* Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets
* up the writer with the header and presorted status.
*
* @param toolkit the engine
* @param originalHeader original header
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
* @param programRecord the program record for this program
*/
public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) {
SAMFileHeader header = originalHeader.clone();
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
for ( SAMProgramRecord record : oldRecords )
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS )
if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS )
newRecords.add(record);
newRecords.add(programRecord);
header.setProgramRecords(newRecords);
return header;
}
/**
* Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns
* the new header to be added to the BAM writer.
*
* @param toolkit the engine
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
* @param walker the walker object (so we can extract the command line)
* @param PROGRAM_RECORD_NAME the name for the PG tag
* @return a pre-filled header for the bam writer
*/
public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME);
return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord);
}
/**
* Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets
* up the writer with the header and presorted status.
*
* @param writer BAM file writer
* @param toolkit the engine
* @param preSorted whether or not the writer can assume reads are going to be added are already sorted
* @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file
* @param walker the walker object (so we can extract the command line)
* @param PROGRAM_RECORD_NAME the name for the PG tag
*/
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME);
writer.writeHeader(header);
writer.setPresorted(preSorted);
}
/**
* Creates a program record (@PG) tag
*
* @param toolkit the engine
* @param walker the walker object (so we can extract the command line)
* @param PROGRAM_RECORD_NAME the name for the PG tag
* @return a program record for the tool
*/
public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) {
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
@ -858,4 +904,5 @@ public class Utils {
}
return subLists;
}
}

View File

@ -1,11 +1,11 @@
package org.broadinstitute.sting.utils.activeregion;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.util.StringUtil;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
@ -54,27 +54,31 @@ public class ActiveRegion implements HasGenomeLocation {
public ArrayList<GATKSAMRecord> getReads() { return reads; }
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) {
return getActiveRegionReference(referenceReader, 0);
}
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
return getReference( referenceReader, padding, extendedLoc );
}
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) {
return getFullReference(referenceReader, 0);
}
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
return getReference( referenceReader, padding, fullExtentReferenceLoc );
}
private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
@Requires("referenceReader.isUppercasingBases()")
private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(),
Math.max(1, genomeLoc.getStart() - padding),
Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
StringUtil.toUpperCase(reference);
return reference;
}

View File

@ -24,11 +24,11 @@
package org.broadinstitute.sting.utils.activeregion;
import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collections;
@ -45,6 +45,7 @@ public class ActivityProfile {
final GenomeLocParser parser;
final boolean presetRegions;
GenomeLoc regionStartLoc = null;
GenomeLoc regionStopLoc = null;
final List<ActivityProfileResult> isActiveList;
private static final int FILTER_SIZE = 80;
private static final double[] GaussianKernel;
@ -71,19 +72,54 @@ public class ActivityProfile {
this.regionStartLoc = regionStartLoc;
}
public void add(final GenomeLoc loc, final ActivityProfileResult result) {
if ( loc.size() != 1 )
throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" );
isActiveList.add(result);
if( regionStartLoc == null ) {
@Override
public String toString() {
return "ActivityProfile{" +
"start=" + regionStartLoc +
", stop=" + regionStopLoc +
'}';
}
/**
* Add the next ActivityProfileResult to this profile.
*
* Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown
*
* @param result a well-formed ActivityProfileResult result to incorporate into this profile
*/
@Requires("result != null")
public void add(final ActivityProfileResult result) {
final GenomeLoc loc = result.getLoc();
if ( regionStartLoc == null ) {
regionStartLoc = loc;
regionStopLoc = loc;
} else {
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
regionStopLoc = loc;
}
isActiveList.add(result);
}
// for unit testing
public List<ActivityProfileResult> getActiveList() {
return isActiveList;
}
public int size() {
return isActiveList.size();
}
public boolean isEmpty() {
return isActiveList.isEmpty();
}
public boolean hasPresetRegions() {
return presetRegions;
}
/**
* Band pass this ActivityProfile, producing a new profile that's band pass filtered
* @return a new ActivityProfile that's the band-pass filtered version of this profile
@ -104,14 +140,21 @@ public class ActivityProfile {
}
iii++;
}
final double[] filteredProbArray = new double[activeProbArray.length];
final double[] filteredProbArray;
if( !presetRegions ) {
// if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray
filteredProbArray = new double[activeProbArray.length];
for( iii = 0; iii < activeProbArray.length; iii++ ) {
final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii));
final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1));
filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
}
} else {
// otherwise we simply use the activeProbArray directly
filteredProbArray = activeProbArray;
}
iii = 0;
for( final double prob : filteredProbArray ) {
final ActivityProfileResult result = isActiveList.get(iii++);
@ -119,6 +162,7 @@ public class ActivityProfile {
result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE;
result.resultValue = null;
}
return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc);
}
@ -166,6 +210,7 @@ public class ActivityProfile {
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
}
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List<ActiveRegion> returnList) {
if( !isActive || curEnd - curStart < maxRegionSize ) {
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);

View File

@ -1,12 +1,16 @@
package org.broadinstitute.sting.utils.activeregion;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 7/27/12
*/
public class ActivityProfileResult {
private GenomeLoc loc;
public double isActiveProb;
public ActivityProfileResultState resultState;
public Number resultValue;
@ -16,16 +20,52 @@ public class ActivityProfileResult {
HIGH_QUALITY_SOFT_CLIPS
}
public ActivityProfileResult( final double isActiveProb ) {
this.isActiveProb = isActiveProb;
this.resultState = ActivityProfileResultState.NONE;
this.resultValue = null;
/**
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb
*
* @param loc the position of the result profile (for debugging purposes)
* @param isActiveProb the probability of being active (between 0 and 1)
*/
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) {
this(loc, isActiveProb, ActivityProfileResultState.NONE, null);
}
public ActivityProfileResult( final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
/**
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some
* information about the result state and value (TODO RYAN -- what do these mean?)
*
* @param loc the position of the result profile (for debugging purposes)
* @param isActiveProb the probability of being active (between 0 and 1)
*/
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
// make sure the location of that activity profile is 1
if ( loc.size() != 1 )
throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc);
this.loc = loc;
this.isActiveProb = isActiveProb;
this.resultState = resultState;
this.resultValue = resultValue;
}
/**
* Get the genome loc associated with the ActivityProfileResult
* @return the location of this result
*/
@Ensures("result != null")
public GenomeLoc getLoc() {
return loc;
}
@Override
public String toString() {
return "ActivityProfileResult{" +
"loc=" + loc +
", isActiveProb=" + isActiveProb +
", resultState=" + resultState +
", resultValue=" + resultValue +
'}';
}
}

View File

@ -101,7 +101,7 @@ public class PluginManager<PluginType> {
* Create a new plugin manager.
* @param pluginType Core type for a plugin.
*/
public PluginManager(Class<PluginType> pluginType) {
public PluginManager(Class pluginType) {
this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), null);
}
@ -110,7 +110,7 @@ public class PluginManager<PluginType> {
* @param pluginType Core type for a plugin.
* @param classpath Custom class path to search for classes.
*/
public PluginManager(Class<PluginType> pluginType, List<URL> classpath) {
public PluginManager(Class pluginType, List<URL> classpath) {
this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), classpath);
}
@ -120,7 +120,7 @@ public class PluginManager<PluginType> {
* @param pluginCategory Provides a category name to the plugin. Must not be null.
* @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null.
*/
public PluginManager(Class<PluginType> pluginType, String pluginCategory, String pluginSuffix) {
public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) {
this(pluginType, pluginCategory, pluginSuffix, null);
}
@ -131,7 +131,7 @@ public class PluginManager<PluginType> {
* @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null.
* @param classpath Custom class path to search for classes.
*/
public PluginManager(Class<PluginType> pluginType, String pluginCategory, String pluginSuffix, List<URL> classpath) {
public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List<URL> classpath) {
this.pluginCategory = pluginCategory;
this.pluginSuffix = pluginSuffix;
@ -149,6 +149,7 @@ public class PluginManager<PluginType> {
}
// Load all classes types filtering them by concrete.
@SuppressWarnings("unchecked")
Set<Class<? extends PluginType>> allTypes = reflections.getSubTypesOf(pluginType);
for( Class<? extends PluginType> type: allTypes ) {
// The plugin manager does not support anonymous classes; to be a plugin, a class must have a name.
@ -325,7 +326,7 @@ public class PluginManager<PluginType> {
* @param pluginType The type of plugin.
* @return A name for this type of plugin.
*/
public String getName(Class<? extends PluginType> pluginType) {
public String getName(Class pluginType) {
String pluginName = "";
if (pluginName.length() == 0) {

View File

@ -587,7 +587,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);

View File

@ -283,6 +283,12 @@ public class UserException extends ReviewedStingException {
}
}
public static class FailsStrictValidation extends UserException {
public FailsStrictValidation(File f, String message) {
super(String.format("File %s fails strict validation: %s", f.getAbsolutePath(), message));
}
}
public static class MalformedFile extends UserException {
public MalformedFile(String message) {
super(String.format("Unknown file is malformed: %s", message));

View File

@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.StringUtil;
import org.apache.log4j.Priority;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -40,6 +41,8 @@ import java.util.Arrays;
* A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer.
*
* Thread-safe! Uses a thread-local cache
*
* Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set
*/
public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class);
@ -54,10 +57,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
public static final long DEFAULT_CACHE_SIZE = 1000000;
/** The cache size of this CachingIndexedFastaSequenceFile */
final long cacheSize;
private final long cacheSize;
/** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */
final long cacheMissBackup;
private final long cacheMissBackup;
/**
* If true, we will preserve the case of the original base in the genome, not
*/
private final boolean preserveCase;
// information about checking efficiency
long cacheHits = 0;
@ -84,37 +92,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
/**
* Same as general constructor but allows one to override the default cacheSize
*
* @param fasta
* @param index
* @param cacheSize
* @param fasta the file we will read our FASTA sequence from.
* @param index the index of the fasta file, used for efficient random access
* @param cacheSize the size in bp of the cache we will use for this reader
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
*/
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) {
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) {
super(fasta, index);
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
*
* @param fasta The file to open.
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
* @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
*/
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) {
this(fasta, index, DEFAULT_CACHE_SIZE);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
*
* Looks for a index file for fasta on disk
*
* @param fasta The file to open.
*/
public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException {
this(fasta, DEFAULT_CACHE_SIZE);
this.preserveCase = preserveCase;
}
/**
@ -124,12 +112,76 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* Uses provided cacheSize instead of the default
*
* @param fasta The file to open.
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
*/
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException {
super(fasta);
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
this.preserveCase = preserveCase;
}
// /**
// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
// *
// * @param fasta The file to open.
// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
// */
// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) {
// this(fasta, index, DEFAULT_CACHE_SIZE);
// }
/**
* Same as general constructor but allows one to override the default cacheSize
*
* By default, this CachingIndexedFastaReader converts all incoming bases to upper case
*
* @param fasta the file we will read our FASTA sequence from.
* @param index the index of the fasta file, used for efficient random access
* @param cacheSize the size in bp of the cache we will use for this reader
*/
public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) {
this(fasta, index, cacheSize, false);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
*
* Looks for a index file for fasta on disk.
* This CachingIndexedFastaReader will convert all FASTA bases to upper cases under the hood
*
* @param fasta The file to open.
*/
public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException {
this(fasta, false);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
*
* Looks for a index file for fasta on disk
*
* @param fasta The file to open.
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
*/
public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException {
this(fasta, DEFAULT_CACHE_SIZE, preserveCase);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
*
* Looks for a index file for fasta on disk
* Uses provided cacheSize instead of the default
*
* @param fasta The file to open.
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
*/
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
this(fasta, cacheSize, false);
}
/**
@ -168,6 +220,25 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
return cacheSize;
}
/**
* Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is
* everything being made upper case?
*
* @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased
*/
public boolean isPreservingCase() {
return preserveCase;
}
/**
* Is uppercasing bases?
*
* @return true if bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if this reader are in the original case in the fasta
*/
public boolean isUppercasingBases() {
return ! isPreservingCase();
}
/**
* Gets the subsequence of the contig in the range [start,stop]
*
@ -177,8 +248,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* @param contig Contig whose subsequence to retrieve.
* @param start inclusive, 1-based start of region.
* @param stop inclusive, 1-based stop of region.
* @return The partial reference sequence associated with this range.
* @return The partial reference sequence associated with this range. If preserveCase is false, then
* all of the bases in the ReferenceSequence returned by this method will be upper cased.
*/
@Override
public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) {
final ReferenceSequence result;
final Cache myCache = cache.get();
@ -186,6 +259,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
if ( (stop - start) >= cacheSize ) {
cacheMisses++;
result = super.getSubsequenceAt(contig, start, stop);
if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases());
} else {
// todo -- potential optimization is to check if contig.name == contig, as this in generally will be true
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
@ -198,7 +272,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
myCache.start = Math.max(start - cacheMissBackup, 0);
myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength());
myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop);
//System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop);
// convert all of the bases in the sequence to upper case if we aren't preserving cases
if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases());
} else {
cacheHits++;
}
@ -215,8 +291,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
}
}
// for debugging -- print out our efficiency if requested
if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 )
printEfficiency(Priority.INFO);
return result;
}
}

View File

@ -199,7 +199,7 @@ public class RecalDatum {
@Override
public String toString() {
return String.format("%.2f,%,2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
}
public String stringForCSV() {

View File

@ -646,7 +646,7 @@ public class AlignmentUtils {
// get the indel element and move it left one base
CigarElement ce = cigar.getCigarElement(indexOfIndel - 1);
elements.add(new CigarElement(ce.getLength() - 1, ce.getOperator()));
elements.add(new CigarElement(Math.max(ce.getLength() - 1, 0), ce.getOperator()));
elements.add(cigar.getCigarElement(indexOfIndel));
if (indexOfIndel + 1 < cigar.numCigarElements()) {
ce = cigar.getCigarElement(indexOfIndel + 1);

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.HashMap;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: Nov 13
*/
public class BySampleSAMFileWriter extends NWaySAMFileWriter{
private final Map<String, SAMReaderID> sampleToWriterMap;
public BySampleSAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) {
super(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, pRecord, keep_records);
sampleToWriterMap = new HashMap<String, SAMReaderID>(toolkit.getSAMFileHeader().getReadGroups().size() * 2);
for (SAMReaderID readerID : toolkit.getReadsDataSource().getReaderIDs()) {
for (SAMReadGroupRecord rg : toolkit.getReadsDataSource().getHeader(readerID).getReadGroups()) {
String sample = rg.getSample();
if (sampleToWriterMap.containsKey(sample) && sampleToWriterMap.get(sample) != readerID) {
throw new ReviewedStingException("The same sample appears in multiple files, this input cannot be multiplexed using the BySampleSAMFileWriter, try NWaySAMFileWriter instead.");
}
else {
sampleToWriterMap.put(sample, readerID);
}
}
}
}
@Override
public void addAlignment(SAMRecord samRecord) {
super.addAlignment(samRecord, sampleToWriterMap.get(samRecord.getReadGroup().getSample()));
}
}

View File

@ -1,186 +1,177 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.io.File;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: May 31, 2011
* Time: 3:52:49 PM
* To change this template use File | Settings | File Templates.
*/
public class NWaySAMFileWriter implements SAMFileWriter {
private Map<SAMReaderID,SAMFileWriter> writerMap = null;
private boolean presorted ;
GenomeAnalysisEngine toolkit;
boolean KEEP_ALL_PG_RECORDS = false;
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map<String,String> in2out, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) {
this.presorted = presorted;
this.toolkit = toolkit;
this.KEEP_ALL_PG_RECORDS = keep_records;
writerMap = new HashMap<SAMReaderID,SAMFileWriter>();
setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord);
}
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) {
this.presorted = presorted;
this.toolkit = toolkit;
this.KEEP_ALL_PG_RECORDS = keep_records;
writerMap = new HashMap<SAMReaderID,SAMFileWriter>();
setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord);
}
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map<String,String> in2out, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5) {
this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false);
}
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly , boolean generateMD5) {
this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false);
}
/**
* Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved
* from <code>toolkit</code>). The <code>in2out</code> map must contain an entry for each input filename and map it
* onto a unique output file name.
* @param toolkit
* @param in2out
*/
public void setupByReader(GenomeAnalysisEngine toolkit, Map<String,String> in2out, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) {
if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL");
for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) {
String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName();
String outName;
if ( ! in2out.containsKey(fName) )
throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName);
outName = in2out.get(fName);
if ( writerMap.containsKey( rid ) )
throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+
"map file likely contains multiple entries for this input file");
addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord);
}
}
/**
* Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved
* from <code>toolkit</code>). The output file names will be generated automatically by stripping ".sam" or ".bam" off the
* input file name and adding ext instead (e.g. ".cleaned.bam").
* onto a unique output file name.
* @param toolkit
* @param ext
*/
public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) {
for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) {
String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName();
String outName;
int pos ;
if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM");
else {
if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM");
else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam");
}
String prefix = fName.substring(0,pos);
outName = prefix+ext;
if ( writerMap.containsKey( rid ) )
throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered");
addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord);
}
}
private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted,
boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) {
File f = new File(outName);
SAMFileHeader header = toolkit.getSAMFileHeader(id).clone();
header.setSortOrder(order);
if ( programRecord != null ) {
// --->> add program record
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
for ( SAMProgramRecord record : oldRecords ) {
if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS )
newRecords.add(record);
}
newRecords.add(programRecord);
header.setProgramRecords(newRecords);
// <-- add program record ends here
}
SAMFileWriterFactory factory = new SAMFileWriterFactory();
factory.setCreateIndex(indexOnTheFly);
factory.setCreateMd5File(generateMD5);
SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f);
writerMap.put(id,sw);
}
public Collection<SAMFileWriter> getWriters() {
return writerMap.values();
}
public void addAlignment(SAMRecord samRecord) {
final SAMReaderID id = toolkit.getReaderIDForRead(samRecord);
String rg = samRecord.getStringAttribute("RG");
if ( rg != null ) {
String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg);
samRecord.setAttribute("RG",rg_orig);
}
writerMap.get(id).addAlignment(samRecord);
}
public SAMFileHeader getFileHeader() {
return toolkit.getSAMFileHeader();
}
public void close() {
for ( SAMFileWriter w : writerMap.values() ) w.close();
}
}
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: May 31, 2011
* Time: 3:52:49 PM
* To change this template use File | Settings | File Templates.
*/
public class NWaySAMFileWriter implements SAMFileWriter {
private Map<SAMReaderID,SAMFileWriter> writerMap = null;
private boolean presorted ;
GenomeAnalysisEngine toolkit;
boolean KEEP_ALL_PG_RECORDS = false;
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map<String,String> in2out, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) {
this.presorted = presorted;
this.toolkit = toolkit;
this.KEEP_ALL_PG_RECORDS = keep_records;
writerMap = new HashMap<SAMReaderID,SAMFileWriter>();
setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord);
}
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) {
this.presorted = presorted;
this.toolkit = toolkit;
this.KEEP_ALL_PG_RECORDS = keep_records;
writerMap = new HashMap<SAMReaderID,SAMFileWriter>();
setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord);
}
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map<String,String> in2out, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5) {
this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false);
}
public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly , boolean generateMD5) {
this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false);
}
/**
* Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved
* from <code>toolkit</code>). The <code>in2out</code> map must contain an entry for each input filename and map it
* onto a unique output file name.
* @param toolkit
* @param in2out
*/
public void setupByReader(GenomeAnalysisEngine toolkit, Map<String,String> in2out, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) {
if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL");
for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) {
String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName();
String outName;
if ( ! in2out.containsKey(fName) )
throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName);
outName = in2out.get(fName);
if ( writerMap.containsKey( rid ) )
throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+
"map file likely contains multiple entries for this input file");
addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord);
}
}
/**
* Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved
* from <code>toolkit</code>). The output file names will be generated automatically by stripping ".sam" or ".bam" off the
* input file name and adding ext instead (e.g. ".cleaned.bam").
* onto a unique output file name.
* @param toolkit
* @param ext
*/
public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order,
boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) {
for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) {
String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName();
String outName;
int pos ;
if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM");
else {
if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM");
else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam");
}
String prefix = fName.substring(0,pos);
outName = prefix+ext;
if ( writerMap.containsKey( rid ) )
throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered");
addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord);
}
}
private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted,
boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) {
File f = new File(outName);
SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord);
SAMFileWriterFactory factory = new SAMFileWriterFactory();
factory.setCreateIndex(indexOnTheFly);
factory.setCreateMd5File(generateMD5);
SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f);
writerMap.put(id,sw);
}
public Collection<SAMFileWriter> getWriters() {
return writerMap.values();
}
public void addAlignment(SAMRecord samRecord) {
final SAMReaderID id = toolkit.getReaderIDForRead(samRecord);
String rg = samRecord.getStringAttribute("RG");
if ( rg != null ) {
String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg);
samRecord.setAttribute("RG",rg_orig);
}
addAlignment(samRecord, id);
}
public void addAlignment(SAMRecord samRecord, SAMReaderID readerID) {
writerMap.get(readerID).addAlignment(samRecord);
}
public SAMFileHeader getFileHeader() {
return toolkit.getSAMFileHeader();
}
public void close() {
for ( SAMFileWriter w : writerMap.values() ) w.close();
}
}

View File

@ -1071,15 +1071,17 @@ public class VariantContext implements Feature { // to enable tribble integratio
if ( g.isCalled() )
observedAlleles.addAll(g.getAlleles());
}
if ( observedAlleles.contains(Allele.NO_CALL) )
observedAlleles.remove(Allele.NO_CALL);
if ( reportedAlleles.size() != observedAlleles.size() )
throw new TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart()));
throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart()));
int originalSize = reportedAlleles.size();
// take the intersection and see if things change
observedAlleles.retainAll(reportedAlleles);
if ( observedAlleles.size() != originalSize )
throw new TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart()));
throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart()));
}
public void validateChromosomeCounts() {

View File

@ -76,7 +76,6 @@ public class BCF2FieldWriterManager {
if ( map.containsKey(field) )
throw new ReviewedStingException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders");
map.put(field, writer);
if ( logger.isDebugEnabled() ) logger.debug(writer);
}
// -----------------------------------------------------------------

View File

@ -0,0 +1,37 @@
package org.broadinstitute.sting;
import org.apache.log4j.Logger;
import org.testng.IAnnotationTransformer;
import org.testng.annotations.ITestAnnotation;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
/**
* Provide default @Test values for GATK testng tests.
*
* Currently only sets the maximum runtime to 10 minutes, if it's not been specified.
*
* See http://beust.com/weblog/2006/10/18/annotation-transformers-in-java/
*
* @author depristo
* @since 10/31/12
* @version 0.1
*/
public class TestNGTestTransformer implements IAnnotationTransformer {
public static final long DEFAULT_TIMEOUT = 1000 * 60 * 10; // 10 minutes max per test
final static Logger logger = Logger.getLogger(TestNGTestTransformer.class);
public void transform(ITestAnnotation annotation,
Class testClass,
Constructor testConstructor,
Method testMethod)
{
if ( annotation.getTimeOut() == 0 ) {
logger.warn("test " + testMethod.toString() + " has no specified timeout, adding default timeout " + DEFAULT_TIMEOUT / 1000 / 60 + " minutes");
annotation.setTimeOut(DEFAULT_TIMEOUT);
}
}
}

View File

@ -0,0 +1,126 @@
package org.broadinstitute.sting.gatk.traversals;
import org.testng.Assert;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
* Created with IntelliJ IDEA.
* User: thibault
* Date: 11/13/12
* Time: 2:47 PM
*/
public class TraverseActiveRegionsTest extends BaseTest {
private class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
private final double prob;
public DummyActiveRegionWalker() {
this.prob = 1.0;
}
@Override
public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return new ActivityProfileResult(ref.getLocus(), prob);
}
@Override
public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
return 0;
}
@Override
public Integer reduceInit() {
return 0;
}
@Override
public Integer reduce(Integer value, Integer sum) {
return 0;
}
}
private final TraverseActiveRegions<Integer, Integer> t = new TraverseActiveRegions<Integer, Integer>();
private IndexedFastaSequenceFile reference;
private GenomeLocParser genomeLocParser;
private ActiveRegionWalker<Integer, Integer> walker;
@BeforeClass
private void init() throws FileNotFoundException {
reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference));
SAMSequenceDictionary dictionary = reference.getSequenceDictionary();
genomeLocParser = new GenomeLocParser(dictionary);
}
@Test
public void testAllIntervalsSeen() throws Exception {
List<GenomeLoc> intervals = new ArrayList<GenomeLoc>();
GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1);
intervals.add(interval);
LocusShardDataProvider dataProvider = createDataProvider(intervals);
t.traverse(walker, dataProvider, 0);
boolean allGenomeLocsSeen = true;
for (GenomeLoc loc : intervals) {
boolean thisGenomeLocSeen = false;
for (ActivityProfileResult active : t.profile.getActiveList()) {
if (loc.equals(active.getLoc())) {
thisGenomeLocSeen = true;
break;
}
}
if (!thisGenomeLocSeen) {
allGenomeLocsSeen = false;
break;
}
}
Assert.assertTrue(allGenomeLocsSeen, "Some intervals missing from activity profile");
}
private LocusShardDataProvider createDataProvider(List<GenomeLoc> intervals) {
walker = new DummyActiveRegionWalker();
StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList<SAMRecord>());
Shard shard = new MockLocusShard(genomeLocParser, intervals);
WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser,iterator,shard.getGenomeLocs());
WindowMaker.WindowMakerIterator window = windowMaker.next();
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
//engine.setReferenceDataSource(reference);
engine.setGenomeLocParser(genomeLocParser);
t.initialize(engine);
return new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>());
}
}

View File

@ -38,7 +38,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
public void testCallableLociWalkerBed() {
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
Arrays.asList("9e4ec9c23f21a8162d27a39ab057398c", SUMMARY_MD5));
Arrays.asList("42e86c06c167246a28bffdacaca75ffb", SUMMARY_MD5));
executeTest("formatBed", spec);
}
@ -46,7 +46,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
public void testCallableLociWalkerPerBase() {
String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
Arrays.asList("e6044b4495ef24f542403e6a94437068", SUMMARY_MD5));
Arrays.asList("d66c525d9c70f62df8156261d3e535ad", SUMMARY_MD5));
executeTest("format_state_per_base", spec);
}
@ -54,7 +54,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
public void testCallableLociWalker2() {
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100 -L 1:10,000,110-10,000,120 -summary %s";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
Arrays.asList("c671f65712d9575b8b3e1f1dbedc146e", "d287510eac04acf5a56f5cde2cba0e4a"));
Arrays.asList("330f476085533db92a9dbdb3a127c041", "d287510eac04acf5a56f5cde2cba0e4a"));
executeTest("formatBed by interval", spec);
}
@ -62,7 +62,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest {
public void testCallableLociWalker3() {
String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s";
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2,
Arrays.asList("b7d26a470ef906590249f2fa45fd6bdd", "da431d393f7c2b2b3e27556b86c1dbc7"));
Arrays.asList("46a53379aaaf9803276a0a34b234f6ab", "da431d393f7c2b2b3e27556b86c1dbc7"));
executeTest("formatBed lots of arguments", spec);
}
}

View File

@ -7,7 +7,7 @@ import java.util.ArrayList;
public class UnifiedGenotyperLargeScaleTest extends WalkerTest {
@Test
@Test( timeOut = 18000000 )
public void testUnifiedGenotyperWholeGenome() {
WalkerTestSpec spec = new WalkerTestSpec(
"-R " + hg18Reference +
@ -22,7 +22,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest {
executeTest("testUnifiedGenotyperWholeGenome", spec);
}
@Test
@Test( timeOut = 18000000 )
public void testUnifiedGenotyperWholeExome() {
WalkerTestSpec spec = new WalkerTestSpec(
"-R " + hg18Reference +
@ -37,7 +37,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest {
executeTest("testUnifiedGenotyperWholeExome", spec);
}
@Test
@Test( timeOut = 18000000 )
public void testUnifiedGenotyperWGParallel() {
WalkerTestSpec spec = new WalkerTestSpec(
"-R " + hg18Reference +

View File

@ -0,0 +1,35 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
import org.testng.SkipException;
import org.testng.annotations.Test;
import java.util.Arrays;
// ********************************************************************************** //
// Note that this class also serves as an integration test for the VariantAnnotator! //
// ********************************************************************************** //
public class UnifiedGenotyperLiteIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
// --------------------------------------------------------------------------------------------------------------
//
// testing contamination down-sampling gets ignored
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testContaminationDownsampling() {
if ( !GATKLiteUtils.isGATKLite() )
throw new SkipException("Only want to test for GATK lite");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("9addd225a985178339a0c49dc5fdc220"));
executeTest("test contamination_percentage_to_filter gets ignored", spec);
}
}

View File

@ -6,7 +6,7 @@ import org.testng.annotations.Test;
import java.util.ArrayList;
public class IndelRealignerLargeScaleTest extends WalkerTest {
@Test
@Test( timeOut = 18000000 )
public void testHighCoverage() {
WalkerTestSpec spec = new WalkerTestSpec(
@ -21,7 +21,7 @@ public class IndelRealignerLargeScaleTest extends WalkerTest {
executeTest("testIndelRealignerHighCoverage", spec);
}
@Test
@Test( timeOut = 18000000 )
public void testRealigner() {
WalkerTestSpec spec1 = new WalkerTestSpec(

View File

@ -6,7 +6,7 @@ import org.testng.annotations.Test;
import java.util.ArrayList;
public class RealignerTargetCreatorLargeScaleTest extends WalkerTest {
@Test
@Test( timeOut = 18000000 )
public void testRealignerTargetCreator() {
WalkerTestSpec spec1 = new WalkerTestSpec(

View File

@ -354,7 +354,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
@Test
public void testCompOverlap() {
String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + variantEvalTestDataRoot + "pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + variantEvalTestDataRoot + "pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("59ad39e03678011b5f62492fa83ede04"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("d0d9208060e69e157dac3bf01bdd83b0"));
executeTestParallel("testCompOverlap",spec);
}

View File

@ -26,9 +26,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
}
VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
"f360ce3eb2b0b887301be917a9843e2b", // tranches
"287fea5ea066bf3fdd71f5ce9b58eab3", // recal file
"afa297c743437551cc2bd36ddd6d6d75"); // cut VCF
"4d08c8eee61dd1bdea8c5765f34e41f0", // tranches
"ce396fe4045e020b61471f6737dff36e", // recal file
"4f59bd61be900b25c6ecedaa68b9c8de"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
@ -75,9 +75,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
}
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
"a8ce3cd3dccafdf7d580bcce7d660a9a", // tranches
"74c10fc15f9739a938b7138909fbde04", // recal file
"c30d163871a37f2bbf8ee7f761e870b4"); // cut VCF
"6a1eef4d02857dbb117a15420b5c0ce9", // tranches
"238366af66b05b6d21749e799c25353d", // recal file
"3928d6bc5007becf52312ade70f14c42"); // cut VCF
@DataProvider(name = "VRBCFTest")
public Object[][] createVRBCFTest() {

View File

@ -20,7 +20,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",
1,
Arrays.asList("d88bdae45ae0e74e8d8fd196627e612c")
Arrays.asList("954415f84996d27b07d00855e96d33a2")
);
spec.disableShadowBCF();
@ -49,7 +49,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",
1,
Arrays.asList("c0b937edb6a8b6392d477511d4f1ebcf")
Arrays.asList("ca1b5226eaeaffb78d4abd9d2ee10c43")
);
spec.disableShadowBCF();

View File

@ -33,6 +33,8 @@ import java.util.Arrays;
public class ValidateVariantsIntegrationTest extends WalkerTest {
protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e";
public static String baseTestString(String file, String type) {
return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 --variant:vcf " + privateTestDir + file + " --validationType " + type;
}
@ -42,7 +44,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleGood.vcf", "ALL"),
0,
Arrays.asList("d41d8cd98f00b204e9800998ecf8427e")
Arrays.asList(emptyMd5)
);
executeTest("test good file", spec);
@ -53,7 +55,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "REF"),
0,
UserException.MalformedFile.class
UserException.FailsStrictValidation.class
);
executeTest("test bad ref base #1", spec);
@ -64,7 +66,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad2.vcf", "REF"),
0,
UserException.MalformedFile.class
UserException.FailsStrictValidation.class
);
executeTest("test bad ref base #2", spec);
@ -75,7 +77,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "CHR_COUNTS"),
0,
UserException.MalformedFile.class
UserException.FailsStrictValidation.class
);
executeTest("test bad chr counts #1", spec);
@ -86,7 +88,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad2.vcf", "CHR_COUNTS"),
0,
UserException.MalformedFile.class
UserException.FailsStrictValidation.class
);
executeTest("test bad chr counts #2", spec);
@ -97,7 +99,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129,
0,
UserException.MalformedFile.class
UserException.FailsStrictValidation.class
);
executeTest("test bad RS ID", spec);
@ -108,7 +110,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "ALLELES"),
0,
UserException.MalformedFile.class
UserException.FailsStrictValidation.class
);
executeTest("test bad alt allele", spec);
@ -119,18 +121,29 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad3.vcf", "REF"),
0,
UserException.MalformedFile.class
UserException.FailsStrictValidation.class
);
executeTest("test bad ref allele in deletion", spec);
}
@Test
public void testNoValidation() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "NONE"),
0,
Arrays.asList(emptyMd5)
);
executeTest("test no validation", spec);
}
@Test
public void testComplexEvents() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("complexEvents.vcf", "ALL"),
0,
Arrays.asList("d41d8cd98f00b204e9800998ecf8427e")
Arrays.asList(emptyMd5)
);
executeTest("test validating complex events", spec);

View File

@ -123,7 +123,7 @@ public class ActivityProfileUnitTest extends BaseTest {
for ( int i = 0; i < cfg.probs.size(); i++ ) {
double p = cfg.probs.get(i);
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
profile.add(loc, new ActivityProfileResult(p));
profile.add(new ActivityProfileResult(loc, p));
}
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));

View File

@ -30,6 +30,7 @@ import java.util.concurrent.Executors;
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta");
private static final int STEP_SIZE = 1;
private final static boolean DEBUG = false;
//private static final List<Integer> QUERY_SIZES = Arrays.asList(1);
private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100);
@ -53,9 +54,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested;
}
@Test(dataProvider = "fastas", enabled = true)
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize));
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
@ -64,6 +65,8 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
}
private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
@ -92,10 +95,10 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
}
// Tests grabbing sequences around a middle cached value.
@Test(dataProvider = "fastas", enabled = true)
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize));
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
@ -123,11 +126,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
@DataProvider(name = "ParallelFastaTest")
public Object[][] createParallelFastaTest() {
List<Object[]> params = new ArrayList<Object[]>();
// for ( int nt : Arrays.asList(1, 2, 3) ) {
// for ( int cacheSize : CACHE_SIZES ) {
// params.add(new Object[]{simpleFasta, cacheSize, 10, nt});
// }
// }
for ( File fasta : Arrays.asList(simpleFasta) ) {
for ( int cacheSize : CACHE_SIZES ) {
@ -143,9 +141,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
}
@Test(dataProvider = "ParallelFastaTest", enabled = true, timeOut = 60000)
@Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000)
public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException {
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize));
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt));
for ( int iterations = 0; iterations < 1; iterations++ ) {
@ -163,4 +161,49 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
executor.shutdownNow();
}
}
// make sure some bases are lower case and some are upper case
@Test(enabled = true)
public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException {
final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true);
final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA));
int nMixedCase = 0;
for ( SAMSequenceRecord contig : original.getSequenceDictionary().getSequences() ) {
nMixedCase += testCases(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1);
final int step = 100;
for ( int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step ) {
testCases(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos);
}
}
Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. Unexpected test state");
}
private int testCases(final IndexedFastaSequenceFile original,
final IndexedFastaSequenceFile casePreserving,
final IndexedFastaSequenceFile allUpper,
final String contig, final int start, final int stop ) {
final String orig = fetchBaseString(original, contig, start, stop);
final String keptCase = fetchBaseString(casePreserving, contig, start, stop);
final String upperCase = fetchBaseString(allUpper, contig, start, stop).toUpperCase();
final String origToUpper = orig.toUpperCase();
if ( ! orig.equals(origToUpper) ) {
Assert.assertEquals(keptCase, orig, "Case preserving operation not equal to the original case for contig " + contig);
Assert.assertEquals(upperCase, origToUpper, "All upper case reader not equal to the uppercase of original case for contig " + contig);
return 1;
} else {
return 0;
}
}
private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) {
if ( start == -1 )
return new String(reader.getSequence(contig).getBases());
else
return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
}
}

View File

@ -141,19 +141,19 @@ class GATKResourcesBundle extends QScript {
// CEU trio (NA12878,NA12891,NA12892) best practices results (including PBT)
//
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/callsets/CEUtrio_BestPractices/current/CEUTrio.HiSeq.WGS.b37.UG.snps_and_indels.recalibrated.filtered.phaseByTransmission.vcf",
"CEUTrio.HiSeq.WGS.b37.UG.bestPractices.phaseByTransmission",b37,true,false))
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/callsets/CEUtrio_BestPractices/CEUTrio.HiSeq.WGS.b37.snps_and_indels.recalibrated.filtered.phased.CURRENT.vcf",
"CEUTrio.HiSeq.WGS.b37.bestPractices.phased",b37,true,false))
//
// example call set for wiki tutorial
//
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf",
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf",
"NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true))
//
// Test BAM file, specific to each reference
//
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam",
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam",
"IGNORE", b37, false, false))
//
@ -317,6 +317,7 @@ class GATKResourcesBundle extends QScript {
class UG(@Input bam: File, @Input ref: File, @Input outVCF: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
this.input_file = List(bam)
this.reference_sequence = ref
this.intervalsString ++= List("20");
this.out = outVCF
}

View File

@ -24,7 +24,6 @@
package org.broadinstitute.sting.queue
import function.QFunction
import java.io.File
import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.util._
@ -93,21 +92,25 @@ class QCommandLine extends CommandLineProgram with Logging {
private lazy val qScriptPluginManager = {
qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory)
qScriptManager.loadScripts(scripts, qScriptClasses)
new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL))
new PluginManager[QScript](qPluginType, Seq(qScriptClasses.toURI.toURL))
}
private lazy val qStatusMessengerPluginManager = {
new PluginManager[QStatusMessenger](classOf[QStatusMessenger])
private lazy val qCommandPlugin = {
new PluginManager[QCommandPlugin](classOf[QCommandPlugin])
}
ClassFieldCache.parsingEngine = new ParsingEngine(this)
private lazy val allCommandPlugins = qCommandPlugin.createAllTypes()
private lazy val qPluginType: Class[_ <: QScript] = {
allCommandPlugins.map(_.qScriptClass).headOption.getOrElse(classOf[QScript])
}
/**
* Takes the QScripts passed in, runs their script() methods, retrieves their generated
* functions, and then builds and runs a QGraph based on the dependencies.
*/
def execute = {
val allStatusMessengers = qStatusMessengerPluginManager.createAllTypes()
ClassFieldCache.parsingEngine = this.parser
if (settings.qSettings.runName == null)
settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName)
@ -115,18 +118,31 @@ class QCommandLine extends CommandLineProgram with Logging {
settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp")
qGraph.initializeWithSettings(settings)
for (statusMessenger <- allStatusMessengers) {
loadArgumentsIntoObject(statusMessenger)
for (commandPlugin <- allCommandPlugins) {
loadArgumentsIntoObject(commandPlugin)
}
for (statusMessenger <- allStatusMessengers) {
statusMessenger.started()
for (commandPlugin <- allCommandPlugins) {
if (commandPlugin.statusMessenger != null)
commandPlugin.statusMessenger.started()
}
qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq
// TODO: Default command plugin argument?
val remoteFileConverter = (
for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null))
yield commandPlugin.remoteFileConverter
).headOption.getOrElse(null)
if (remoteFileConverter != null)
loadArgumentsIntoObject(remoteFileConverter)
val allQScripts = qScriptPluginManager.createAllTypes()
for (script <- allQScripts) {
logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript])))
loadArgumentsIntoObject(script)
allCommandPlugins.foreach(_.initScript(script))
// TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now.
//if (settings.run)
script.pullInputs()
@ -137,10 +153,15 @@ class QCommandLine extends CommandLineProgram with Logging {
case e: Exception =>
throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e)
}
if (remoteFileConverter != null) {
if (remoteFileConverter.convertToRemoteEnabled)
script.mkRemoteOutputs(remoteFileConverter)
}
script.functions.foreach(qGraph.add(_))
logger.info("Added " + script.functions.size + " functions")
}
// Execute the job graph
qGraph.run()
@ -162,14 +183,19 @@ class QCommandLine extends CommandLineProgram with Logging {
if (!success) {
logger.info("Done with errors")
qGraph.logFailed()
for (statusMessenger <- allStatusMessengers)
statusMessenger.exit("Done with errors")
for (commandPlugin <- allCommandPlugins)
if (commandPlugin.statusMessenger != null)
commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts))
1
} else {
if (settings.run) {
allQScripts.foreach(_.pushOutputs())
for (statusMessenger <- allStatusMessengers)
statusMessenger.done(allQScripts.map(_.remoteOutputs))
for (commandPlugin <- allCommandPlugins)
if (commandPlugin.statusMessenger != null) {
val allInputs = allQScripts.map(_.remoteInputs)
val allOutputs = allQScripts.map(_.remoteOutputs)
commandPlugin.statusMessenger.done(allInputs, allOutputs)
}
}
0
}
@ -189,7 +215,7 @@ class QCommandLine extends CommandLineProgram with Logging {
override def getArgumentSources = {
var plugins = Seq.empty[Class[_]]
plugins ++= qScriptPluginManager.getPlugins
plugins ++= qStatusMessengerPluginManager.getPlugins
plugins ++= qCommandPlugin.getPlugins
plugins.toArray
}
@ -200,11 +226,10 @@ class QCommandLine extends CommandLineProgram with Logging {
override def getArgumentSourceName(source: Class[_]) = {
if (classOf[QScript].isAssignableFrom(source))
qScriptPluginManager.getName(source.asSubclass(classOf[QScript]))
else if (classOf[QStatusMessenger].isAssignableFrom(source))
qStatusMessengerPluginManager.getName(source.asSubclass(classOf[QStatusMessenger]))
else if (classOf[QCommandPlugin].isAssignableFrom(source))
qCommandPlugin.getName(source.asSubclass(classOf[QCommandPlugin]))
else
null
}
/**

View File

@ -0,0 +1,11 @@
package org.broadinstitute.sting.queue
import engine.QStatusMessenger
import util.RemoteFileConverter
trait QCommandPlugin {
def statusMessenger: QStatusMessenger = null
def remoteFileConverter: RemoteFileConverter = null
def qScriptClass: Class[_ <: QScript] = classOf[QScript]
def initScript(script: QScript) {}
}

View File

@ -108,6 +108,24 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon
functions.foreach( f => add(f) )
}
/**
* Convert all @Output files to remote output files.
* @param remoteFileConverter Converter for files to remote files.
*/
def mkRemoteOutputs(remoteFileConverter: RemoteFileConverter) {
for (field <- outputFields) {
val fieldFile = ClassFieldCache.getFieldFile(this, field)
if (fieldFile != null && !fieldFile.isInstanceOf[RemoteFile]) {
val fieldName = ClassFieldCache.fullName(field)
val remoteFile = remoteFileConverter.convertToRemote(fieldFile, fieldName)
ClassFieldCache.setFieldValue(this, field, remoteFile)
}
}
}
/**
* Pull all remote files to the local disk.
*/
def pullInputs() {
val inputs = ClassFieldCache.getFieldFiles(this, inputFields)
for (remoteFile <- filterRemoteFiles(inputs)) {
@ -116,6 +134,9 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon
}
}
/**
* Push all remote files from the local disk.
*/
def pushOutputs() {
val outputs = ClassFieldCache.getFieldFiles(this, outputFields)
for (remoteFile <- filterRemoteFiles(outputs)) {
@ -124,8 +145,25 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon
}
}
def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] =
outputFields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap
/**
* List out the remote outputs
* @return the RemoteFile outputs by argument source
*/
def remoteInputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(inputFields))
/**
* List out the remote outputs
* @return the RemoteFile outputs by argument source
*/
def remoteOutputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(outputFields))
private def tagMap(remoteFieldMap: Map[ArgumentSource, Seq[RemoteFile]]): Map[String, Seq[RemoteFile]] = {
remoteFieldMap.collect{ case (k, v) => ClassFieldCache.fullName(k) -> v }.toMap
}
private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = {
fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap
}
private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] =
fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile])

View File

@ -47,6 +47,7 @@ import java.io.{OutputStreamWriter, File}
*/
class QGraph extends Logging {
var settings: QGraphSettings = _
var messengers: Seq[QStatusMessenger] = Nil
private def dryRun = !settings.run
private var numMissingValues = 0
@ -95,7 +96,7 @@ class QGraph extends Logging {
* The settings aren't necessarily available until after this QGraph object has been constructed, so
* this function must be called once the QGraphSettings have been filled in.
*
* @param settings
* @param settings QGraphSettings
*/
def initializeWithSettings(settings: QGraphSettings) {
this.settings = settings
@ -430,6 +431,7 @@ class QGraph extends Logging {
val edge = readyJobs.head
edge.runner = newRunner(edge.function)
edge.start()
messengers.foreach(_.started(jobShortName(edge.function)))
startedJobs += edge
readyJobs -= edge
logNextStatusCounts = true
@ -465,8 +467,14 @@ class QGraph extends Logging {
updateStatus()
runningJobs.foreach(edge => edge.status match {
case RunnerStatus.DONE => doneJobs += edge
case RunnerStatus.FAILED => failedJobs += edge
case RunnerStatus.DONE => {
doneJobs += edge
messengers.foreach(_.done(jobShortName(edge.function)))
}
case RunnerStatus.FAILED => {
failedJobs += edge
messengers.foreach(_.exit(jobShortName(edge.function), edge.function.jobErrorLines.mkString("%n".format())))
}
case RunnerStatus.RUNNING => /* do nothing while still running */
})
@ -493,7 +501,7 @@ class QGraph extends Logging {
// incremental
if ( logNextStatusCounts && INCREMENTAL_JOBS_REPORT ) {
logger.info("Writing incremental jobs reports...")
writeJobsReport(false)
writeJobsReport(plot = false)
}
readyJobs ++= getReadyJobs
@ -516,9 +524,13 @@ class QGraph extends Logging {
private def nextRunningCheck(lastRunningCheck: Long) =
((30 * 1000L) - (System.currentTimeMillis - lastRunningCheck))
def formattedStatusCounts: String = {
"%d Pend, %d Run, %d Fail, %d Done".format(
statusCounts.pending, statusCounts.running, statusCounts.failed, statusCounts.done)
}
private def logStatusCounts() {
logger.info("%d Pend, %d Run, %d Fail, %d Done".format(
statusCounts.pending, statusCounts.running, statusCounts.failed, statusCounts.done))
logger.info(formattedStatusCounts)
}
/**
@ -533,6 +545,16 @@ class QGraph extends Logging {
traverseFunctions(edge => recheckDone(edge))
}
// TODO: Yet another field to add (with overloads) to QFunction?
private def jobShortName(function: QFunction): String = {
var name = function.analysisName
if (function.isInstanceOf[CloneFunction]) {
val cloneFunction = function.asInstanceOf[CloneFunction]
name += " %d of %d".format(cloneFunction.cloneIndex, cloneFunction.cloneCount)
}
name
}
/**
* First pass that checks if an edge is done or if it's an intermediate edge if it can be skipped.
* This function may modify the status of previous edges if it discovers that the edge passed in

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.queue.engine
import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.queue.util.RemoteFile
/**
@ -8,6 +7,10 @@ import org.broadinstitute.sting.queue.util.RemoteFile
*/
trait QStatusMessenger {
def started()
def done(files: Seq[Map[ArgumentSource, Seq[RemoteFile]]])
def done(inputs: Seq[Map[String, Seq[RemoteFile]]], outputs: Seq[Map[String, Seq[RemoteFile]]])
def exit(message: String)
def started(job: String)
def done(job: String)
def exit(job: String, message: String)
}

View File

@ -39,7 +39,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMe
private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK]
override def freezeFieldValues() {
this.jarFile = this.originalGATK.jarFile
this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) }
this.out = this.originalOutput
GATKIntervals.copyIntervalArguments(this.originalGATK, this)

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.queue.extensions.picard
import org.broadinstitute.sting.commandline.{Argument, Output, Input}
import java.io.File
import net.sf.picard.analysis.MetricAccumulationLevel
/**
* Created with IntelliJ IDEA.
@ -10,9 +11,8 @@ import java.io.File
* Time: 5:59 PM
* To change this template use File | Settings | File Templates.
*/
class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction {
analysisName = "CalculateHsMetrics"
javaMainClass = "net.sf.picard.sam.CalculateHsMetrics"
@Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true)
var input: Seq[File] = Nil
@ -28,33 +28,15 @@ class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCom
@Argument(doc="Reference file", shortName = "reference", fullName = "reference", required = true)
var reference: File = _
/*
@Argument(doc = "Maximum number of file handles to keep open when spilling read ends to disk. Set this number a little lower than the per-process maximum number of file that may be open. This number can be found by executing the 'ulimit -n' command on a Unix system.", shortName = "max_file_handles", fullName ="max_file_handles_for_read_ends_maps", required=false)
var MAX_FILE_HANDLES_FOR_READ_ENDS_MAP: Int = -1;
@Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by some of the sorting collections. If you are running out of memory, try reducing this number.", shortName = "sorting_ratio", fullName = "sorting_collection_size_ratio", required = false)
var SORTING_COLLECTION_SIZE_RATIO: Double = -1
*/
override def freezeFieldValues() {
super.freezeFieldValues()
// if (outputIndex == null && output != null)
// outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
}
val level = "SAMPLE"
val level = MetricAccumulationLevel.SAMPLE
override def inputBams = input
override def outputBam = output
//this.sortOrder = null
//this.createIndex = Some(true)
override def outputFile = output
override def commandLine = super.commandLine +
required("BAIT_INTERVALS=" + baits) +
required("TARGET_INTERVALS=" + targets) +
required("REFERENCE_SEQUENCE=" + reference) +
optional("METRIC_ACCUMULATION_LEVEL="+level)/*+
conditional(REMOVE_DUPLICATES, "REMOVE_DUPLICATES=true") +
conditional(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP > 0, "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=" + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP.toString) +
conditional(SORTING_COLLECTION_SIZE_RATIO > 0, "SORTING_COLLECTION_SIZE_RATIO=" + SORTING_COLLECTION_SIZE_RATIO.toString) */
optional("METRIC_ACCUMULATION_LEVEL="+level)
}

View File

@ -10,9 +10,8 @@ import java.io.File
* Time: 10:37 AM
* To change this template use File | Settings | File Templates.
*/
class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
analysisName = "CalculateGcMetrics"
javaMainClass = "net.sf.picard.sam.CalculateGcMetrics"
class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction {
analysisName = "CollectGcBiasMetrics"
@Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true)
var input: Seq[File] = Nil
@ -24,8 +23,9 @@ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaC
var reference: File = _
override def inputBams = input
override def outputBam = output
override def outputFile = output
override def commandLine = super.commandLine +
required("SUMMARY_OUTPUT=" + output) +
required("CHART_OUTPUT=" + output+".pdf") +
required("REFERENCE_SEQUENCE=" + reference) +
required("ASSUME_SORTED=true")

View File

@ -10,9 +10,8 @@ import java.io.File
* Time: 10:37 AM
* To change this template use File | Settings | File Templates.
*/
class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction{
analysisName = "CalculateMultipleMetrics"
javaMainClass = "net.sf.picard.sam.CalculateMultipleMetrics"
class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction{
analysisName = "CollectMultipleMetrics"
@Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true)
var input: Seq[File] = Nil
@ -24,7 +23,7 @@ class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.Jav
var reference: File = _
override def inputBams = input
override def outputBam = output
override def outputFile = output
override def commandLine = super.commandLine +
required("REFERENCE_SEQUENCE=" + reference) +
required("ASSUME_SORTED=true") +

View File

@ -0,0 +1,53 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.extensions.picard
import java.io.File
import org.broadinstitute.sting.queue.function.JavaCommandLineFunction
import net.sf.samtools.SAMFileReader.ValidationStringency
import net.sf.samtools.SAMFileHeader.SortOrder
/**
* Wraps a Picard function that operates on BAM files but doesn't output a new BAM file (i.e. QC metric files).
* See http://picard.sourceforge.net/ for more info.
*
* Since the various BAM utilities take slightly different arguments
* some values are optional.
*/
trait PicardMetricsFunction extends JavaCommandLineFunction {
var validationStringency = ValidationStringency.SILENT
var maxRecordsInRam: Option[Int] = None
var assumeSorted: Option[Boolean] = None
protected def inputBams: Seq[File]
protected def outputFile: File
abstract override def commandLine = super.commandLine +
repeat("INPUT=", inputBams, spaceSeparated=false) +
required("TMP_DIR=" + jobTempDir) +
optional("VALIDATION_STRINGENCY=", validationStringency, spaceSeparated=false) +
optional("OUTPUT=", outputFile, spaceSeparated=false) +
optional("MAX_RECORDS_IN_RAM=", maxRecordsInRam, spaceSeparated=false) +
optional("ASSUME_SORTED=", assumeSorted, spaceSeparated=false)
}

View File

@ -38,6 +38,7 @@ object CloneFunction {
class CloneFunction extends CommandLineFunction {
var originalFunction: ScatterGatherableFunction = _
var cloneIndex: Int = _
var cloneCount: Int = _
private var overriddenFields = Map.empty[ArgumentSource, Any]
private var withScatterPartCount = 0

View File

@ -176,6 +176,7 @@ trait ScatterGatherableFunction extends CommandLineFunction {
cloneFunction.originalFunction = this
cloneFunction.analysisName = this.analysisName
cloneFunction.cloneIndex = i
cloneFunction.cloneCount = numClones
cloneFunction.commandDirectory = this.scatterGatherTempDir(dirFormat.format(i))
cloneFunction.jobOutputFile = if (IOUtils.isSpecialFile(this.jobOutputFile)) this.jobOutputFile else new File(this.jobOutputFile.getName)
if (this.jobErrorFile != null)

View File

@ -180,4 +180,15 @@ object ClassFieldCache {
case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown))
}
//
// other utilities
//
/**
* Retrieves the fullName of the argument
* @param field ArgumentSource to check
* @return Full name of the argument source
*/
def fullName(field: ArgumentSource) = field.createArgumentDefinitions().get(0).fullName
}

View File

@ -0,0 +1,21 @@
package org.broadinstitute.sting.queue.util
import java.io.File
trait RemoteFileConverter {
type RemoteFileType <: RemoteFile
/**
* If this remote file creator is capable of converting to a remote file.
* @return true if ready to convert
*/
def convertToRemoteEnabled: Boolean
/**
* Converts to a remote file
* @param file The original file
* @param name A "name" to use for the remote file
* @return The new version of this remote file.
*/
def convertToRemote(file: File, name: String): RemoteFileType
}

View File

@ -28,7 +28,7 @@ import org.testng.annotations.Test
import org.broadinstitute.sting.BaseTest
class DataProcessingPipelineTest {
@Test
@Test(timeOut=36000000)
def testSimpleBAM {
val projectName = "test1"
val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam"
@ -45,7 +45,7 @@ class DataProcessingPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testBWAPEBAM {
val projectName = "test2"
val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam"
@ -60,7 +60,7 @@ class DataProcessingPipelineTest {
" -bwa /home/unix/carneiro/bin/bwa",
" -bwape ",
" -p " + projectName).mkString
spec.fileMD5s += testOut -> "6e70efbe6bafc3fedd60bd406bd201db"
spec.fileMD5s += testOut -> "9fca827ecc8436465b831bb6f879357a"
PipelineTest.executeTest(spec)
}

View File

@ -28,7 +28,7 @@ import org.testng.annotations.Test
import org.broadinstitute.sting.BaseTest
class PacbioProcessingPipelineTest {
@Test
@Test(timeOut=36000000)
def testPacbioProcessingPipeline {
val testOut = "exampleBAM.recal.bam"
val spec = new PipelineTestSpec
@ -40,7 +40,7 @@ class PacbioProcessingPipelineTest {
" -blasr ",
" -test ",
" -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf").mkString
spec.fileMD5s += testOut -> "61b06e8b78a93e6644657e6d38851084"
spec.fileMD5s += testOut -> "b84f9c45e045685067ded681d5e6224c"
PipelineTest.executeTest(spec)
}
}

View File

@ -53,7 +53,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
class DevNullOutputPipelineTest {
@Test
@Test(timeOut=36000000)
def testDevNullOutput() {
val spec = new PipelineTestSpec
spec.name = "devnulloutput"

View File

@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
class ExampleCountLociPipelineTest {
@Test
@Test(timeOut=36000000)
def testCountLoci() {
val testOut = "count.out"
val spec = new PipelineTestSpec

View File

@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
class ExampleCountReadsPipelineTest {
@Test
@Test(timeOut=36000000)
def testCountReads() {
val spec = new PipelineTestSpec
spec.name = "countreads"

View File

@ -77,7 +77,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
class ExampleReadFilterPipelineTest {
@Test
@Test(timeOut=36000000)
def testExampleReadFilter() {
val spec = new PipelineTestSpec
spec.name = "examplereadfilter"

View File

@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
class ExampleRetryMemoryLimitPipelineTest {
@Test
@Test(timeOut=36000000)
def testRetryMemoryLimit() {
val spec = new PipelineTestSpec
spec.name = "RetryMemoryLimit"

View File

@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
class ExampleUnifiedGenotyperPipelineTest {
@Test
@Test(timeOut=36000000)
def testUnifiedGenotyper() {
val spec = new PipelineTestSpec
spec.name = "unifiedgenotyper"
@ -51,7 +51,7 @@ class ExampleUnifiedGenotyperPipelineTest {
Array("vcf_intervals", BaseTest.validationDataLocation + "intervalTest.1.vcf")
).asInstanceOf[Array[Array[Object]]]
@Test(dataProvider = "ugIntervals")
@Test(dataProvider = "ugIntervals", timeOut=36000000)
def testUnifiedGenotyperWithIntervals(intervalsName: String, intervalsPath: String) {
val spec = new PipelineTestSpec
spec.name = "unifiedgenotyper_with_" + intervalsName
@ -64,7 +64,7 @@ class ExampleUnifiedGenotyperPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testUnifiedGenotyperNoGCOpt() {
val spec = new PipelineTestSpec
spec.name = "unifiedgenotyper_no_gc_opt"
@ -80,7 +80,7 @@ class ExampleUnifiedGenotyperPipelineTest {
@DataProvider(name="resMemReqParams")
def getResMemReqParam = Array(Array("mem_free"), Array("virtual_free")).asInstanceOf[Array[Array[Object]]]
@Test(dataProvider = "resMemReqParams")
@Test(dataProvider = "resMemReqParams", timeOut=36000000)
def testUnifiedGenotyperResMemReqParam(reqParam: String) {
val spec = new PipelineTestSpec
spec.name = "unifiedgenotyper_" + reqParam

View File

@ -28,7 +28,7 @@ import org.testng.annotations.Test
import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
class HelloWorldPipelineTest {
@Test
@Test(timeOut=36000000)
def testHelloWorld() {
val spec = new PipelineTestSpec
spec.name = "HelloWorld"
@ -37,7 +37,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithRunName() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithRunName"
@ -47,7 +47,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithMemoryLimit() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldMemoryLimit"
@ -57,7 +57,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithPriority() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithPriority"
@ -67,7 +67,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithLsfResource() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithLsfResource"
@ -77,7 +77,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithLsfResourceAndMemoryLimit() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithLsfResourceAndMemoryLimit"
@ -87,7 +87,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithLsfEnvironment() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithLsfEnvironment"
@ -97,7 +97,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithGridEngineResource() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithGridEngineResource"
@ -107,7 +107,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithGridEngineResourceAndMemoryLimit() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit"
@ -117,7 +117,7 @@ class HelloWorldPipelineTest {
PipelineTest.executeTest(spec)
}
@Test
@Test(timeOut=36000000)
def testHelloWorldWithGridEngineEnvironment() {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithGridEngineEnvironment"

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="org.broad" module="tribble" revision="110" status="integration" />
<info organisation="org.broad" module="tribble" revision="119" status="integration" />
</ivy-module>