From 6984ab0c78b29e57b1cf6c9cb68cafb58316279a Mon Sep 17 00:00:00 2001
From: Menachem Fromer
Date: Tue, 8 Jan 2013 16:40:32 -0500
Subject: [PATCH 001/113] Change ReadBackedPhasing to be VCF4.1 compliant (but
instead of using PS tags, it uses HP tags for pairs of diploid haplotype
labels). The important point is that it no longer uses the '|' character
in place of '/' or flips around the allele order in the genotype. Instead, the
HP FORMAT tag is used to mark the haplotype labels with respect to other
genotypes for the same sample. Note that this enables the phasing of
non-consecutive (but nearby) sites, based on mate pairs, for example. Also,
updated the HaplotypeCallerValidation Qscript to perform PhaseByTransmission
(if ped file given) and then ReadBackedPhasing
---
.../sting/gatk/walkers/phasing/Haplotype.java | 18 +
.../walkers/phasing/ReadBackedPhasing.java | 332 ++++++++++--------
2 files changed, 213 insertions(+), 137 deletions(-)
mode change 100755 => 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java
index 61d5a725e..9e7838647 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java
@@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
class Haplotype extends BaseArray implements Cloneable {
public Haplotype(byte[] bases) {
@@ -68,4 +71,19 @@ class Haplotype extends BaseArray implements Cloneable {
public Haplotype subHaplotype(int fromIndex, int toIndex) {
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
}
+
+ public Haplotype subHaplotype(Set inds) {
+ List basesList = new LinkedList();
+ for (int i : inds) {
+ if (0 <= i && i < bases.length)
+ basesList.add(bases[i]);
+ }
+
+ Byte[] newBases = new Byte[basesList.size()];
+ int index = 0;
+ for (Byte b : basesList)
+ newBases[index++] = b;
+
+ return new Haplotype(newBases);
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
old mode 100755
new mode 100644
index 68eab9889..ef84ed10b
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
@@ -137,6 +137,7 @@ public class ReadBackedPhasing extends RodWalker can trivially phase a hom site relative to ANY previous site:
- Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make();
- uvc.setGenotype(samp, phasedGt);
- }
- else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
- PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
- if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
- SNPallelePair allelePair = new SNPallelePair(gt);
- if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
+ if (gt.isHet()) { // Attempt to phase this het genotype relative to *SOME* previous het genotype:
- CloneableIteratorLinkedList.CloneableIterator prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
- /* Notes:
- 1. Call to next() advances iterator to next position in partiallyPhasedSites.
- 2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads.
- */
- UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant;
- Genotype prevHetGenotype = prevUvc.getGenotype(samp);
+ // Create the list of all het genotypes preceding this one (and in the phasing window as contained in partiallyPhasedSites):
+ List prevHetGenotypes = new LinkedList();
+ CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator();
+ while (phasedIt.hasNext()) {
+ UnfinishedVariantAndReads phasedVr = phasedIt.next();
+ Genotype prevGt = phasedVr.unfinishedVariant.getGenotype(samp);
+ if (prevGt != null && isUnfilteredCalledDiploidGenotype(prevGt) && prevGt.isHet()) {
+ GenotypeAndReadBases grb = new GenotypeAndReadBases(prevGt, phasedVr.sampleReadBases.get(samp), phasedVr.unfinishedVariant.getLocation());
+ prevHetGenotypes.add(grb);
+ if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
+ }
+ }
+
+ SNPallelePair allelePair = new SNPallelePair(gt);
+ if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
+
+ boolean phasedCurGenotypeRelativeToPrevious = false;
+ for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) {
+ PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets);
PhaseResult pr = phaseSampleAtSite(phaseWindow);
- boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
+ phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality);
if (pr.phasingContainsInconsistencies) {
if (DEBUG)
@@ -406,44 +412,43 @@ public class ReadBackedPhasing extends RodWalker prevHetAndInteriorIt = null;
+
+ private int phaseRelativeToIndex = -1;
private int phasingSiteIndex = -1;
+
private Map readsAtHetSites = null;
- private void clearFields() {
- hetGenotypes = null;
- prevHetAndInteriorIt = null;
- phasingSiteIndex = -1;
- readsAtHetSites = null;
- }
-
- public boolean hasPreviousHets() {
- return phasingSiteIndex > 0;
+ public Genotype phaseRelativeToGenotype() {
+ return hetGenotypes[phaseRelativeToIndex];
}
// ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = vr.variant.getGenotype(sample)]
- public PhasingWindow(VariantAndReads vr, String sample) {
- List listHetGenotypes = new LinkedList();
+ public PhasingWindow(VariantAndReads vr, String sample, List prevHetGenotypes, int goBackFromEndOfPrevHets) {
+ if (prevHetGenotypes.isEmpty() || goBackFromEndOfPrevHets >= prevHetGenotypes.size()) // no previous sites against which to phase
+ throw new ReviewedStingException("Should never get empty set of previous sites to phase against");
- // Include previously phased sites in the phasing computation:
- CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator();
- while (phasedIt.hasNext()) {
- UnfinishedVariantAndReads phasedVr = phasedIt.next();
- Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample);
- if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break"
- listHetGenotypes.clear(); // clear out any history
- }
- else if (gt.isHet()) {
- GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation());
- listHetGenotypes.add(grb);
- if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
- prevHetAndInteriorIt = phasedIt.clone();
- }
- }
+ // Include these previously phased sites in the phasing computation:
+ List listHetGenotypes = new LinkedList(prevHetGenotypes);
+
+ phaseRelativeToIndex = listHetGenotypes.size() - 1 - goBackFromEndOfPrevHets;
phasingSiteIndex = listHetGenotypes.size();
- if (phasingSiteIndex == 0) { // no previous sites against which to phase
- clearFields();
- return;
- }
- prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()]
- if (respectPhaseInInput) {
- Genotype prevHetGenotype = prevHetAndInteriorIt.clone().next().unfinishedVariant.getGenotype(sample);
- if (!prevHetGenotype.isPhased()) {
- // Make this genotype unphaseable, since its previous het is not already phased [as required by respectPhaseInInput]:
- clearFields();
- return;
- }
- }
-
- // Add the (het) position to be phased:
+ // Add the (het) position to be phased [at phasingSiteIndex]:
GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant);
GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus);
listHetGenotypes.add(grbPhase);
- if (DEBUG)
- logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
+ if (DEBUG) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
// Include as-of-yet unphased sites in the phasing computation:
for (VariantAndReads nextVr : unphasedSiteQueue) {
if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc
break;
Genotype gt = nextVr.variant.getGenotype(sample);
- if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break"
- break;
- }
- else if (gt.isHet()) {
+ if (gt != null && isUnfilteredCalledDiploidGenotype(gt) && gt.isHet()) {
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant));
listHetGenotypes.add(grb);
if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc);
@@ -571,7 +557,7 @@ public class ReadBackedPhasing extends RodWalker maxPhaseSites) {
listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus);
@@ -585,8 +571,7 @@ public class ReadBackedPhasing extends RodWalker keepReads = new HashSet();
- /* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
+ /* Check which Reads are involved in acyclic paths from phaseRelativeToIndex to (phasingSiteIndex):
In detail:
Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph".
@@ -759,7 +744,7 @@ public class ReadBackedPhasing extends RodWalker keepHetSites = new LinkedList();
int index = 0;
- int numPrecedingRemoved = 0;
+ int numPrecedingPhaseRelativeToSiteRemoved = 0;
+ int numPrecedingPhasingSiteRemoved = 0;
for (GenotypeAndReadBases grb : listHetGenotypes) {
boolean keepSite = sitesWithReads.contains(index);
if (DEBUG && logger.isDebugEnabled() && !keepSite)
logger.debug("Removing read-less site " + grb.loc);
- if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) {
+ if (keepSite || index == phasingSiteIndex || index == phaseRelativeToIndex) {
keepHetSites.add(grb);
if (!keepSite)
if (DEBUG)
logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]...");
}
- else if (index <= phasingSiteIndex)
- numPrecedingRemoved++;
+ else {
+ if (index <= phaseRelativeToIndex)
+ numPrecedingPhaseRelativeToSiteRemoved++;
+ if (index <= phasingSiteIndex)
+ numPrecedingPhasingSiteRemoved++;
+ }
index++;
}
- phasingSiteIndex -= numPrecedingRemoved;
+ phaseRelativeToIndex -= numPrecedingPhaseRelativeToSiteRemoved;
+ phasingSiteIndex -= numPrecedingPhasingSiteRemoved;
return keepHetSites;
}
+ private class SortSitesBySumOfDist implements Comparator {
+ private Vector grb;
+
+ public SortSitesBySumOfDist(List listHetGenotypes) {
+ grb = new Vector(listHetGenotypes);
+ }
+
+ public int compare(Integer i1, Integer i2) {
+ int d1 = calcGenomicDist(i1);
+ int d2 = calcGenomicDist(i2);
+
+ if (d1 != d2)
+ return d1 - d2;
+
+ int id1 = calcIndexDist(i1);
+ int id2 = calcIndexDist(i2);
+ if (id1 != id2)
+ return id1 - id2;
+
+ return i1 - i2;
+ }
+
+ private int calcGenomicDist(int i) {
+ int d1 = grb.get(i).loc.distance(grb.get(phaseRelativeToIndex).loc);
+ int d2 = grb.get(i).loc.distance(grb.get(phasingSiteIndex).loc);
+
+ return d1 + d2;
+ }
+
+ private int calcIndexDist(int i) {
+ int d1 = Math.abs(i - phaseRelativeToIndex);
+ int d2 = Math.abs(i - phasingSiteIndex);
+
+ return d1 + d2;
+ }
+ }
+
private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) {
if (DEBUG)
logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
- int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes
- int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites!
-
- int numOnLeft = prevSiteIndex;
- int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1);
-
- int useOnLeft, useOnRight;
- if (numOnLeft <= numOnRight) {
- int halfToUse = numToUse / 2; // skimp on the left [floor], and be generous with the right side
- useOnLeft = Math.min(halfToUse, numOnLeft);
- useOnRight = Math.min(numToUse - useOnLeft, numOnRight);
+ Set scoreAllIndices = new TreeSet(new SortSitesBySumOfDist(listHetGenotypes));
+ for (int i = 0; i < listHetGenotypes.size(); ++i) {
+ if (i != phaseRelativeToIndex && i != phasingSiteIndex)
+ scoreAllIndices.add(i);
}
- else { // numOnRight < numOnLeft
- int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil]
- useOnRight = Math.min(halfToUse, numOnRight);
- useOnLeft = Math.min(numToUse - useOnRight, numOnLeft);
+
+ Set keepIndices = new TreeSet();
+ // always keep these two indices:
+ keepIndices.add(phaseRelativeToIndex);
+ keepIndices.add(phasingSiteIndex);
+ for (int addInd : scoreAllIndices) {
+ if (keepIndices.size() >= maxPhaseSites)
+ break;
+ else // keepIndices.size() < maxPhaseSites
+ keepIndices.add(addInd);
}
- int startIndex = prevSiteIndex - useOnLeft;
- int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
- phasingSiteIndex -= startIndex;
- listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex);
+
+ List newListHetGenotypes = new LinkedList();
+ int newPhaseRelativeToIndex = -1;
+ int newPhasingSiteIndex = -1;
+ int oldIndex = 0;
+ int newIndex = 0;
+ for (GenotypeAndReadBases grb : listHetGenotypes) {
+ if (keepIndices.contains(oldIndex)) {
+ newListHetGenotypes.add(grb);
+
+ if (oldIndex == phaseRelativeToIndex)
+ newPhaseRelativeToIndex = newIndex;
+ if (oldIndex == phasingSiteIndex)
+ newPhasingSiteIndex = newIndex;
+
+ ++newIndex;
+ }
+ ++oldIndex;
+ }
+
+ phaseRelativeToIndex = newPhaseRelativeToIndex;
+ phasingSiteIndex = newPhasingSiteIndex;
+ listHetGenotypes = newListHetGenotypes;
if (DEBUG)
logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
@@ -900,7 +946,8 @@ public class ReadBackedPhasing extends RodWalker 2!");
+ String[] curPairNames = prevPairNames;
+
byte prevBase = hap.getBase(0); // The 1st base in the haplotype
byte curBase = hap.getBase(1); // The 2nd base in the haplotype
boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase);
boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase);
- if (chosePrevTopChrom != choseCurTopChrom)
- curAllelePair.swapAlleles();
+ if (chosePrevTopChrom != choseCurTopChrom) {
+ //curAllelePair.swapAlleles();
+
+ /* Instead of swapping the alleles (as we used to above),
+ we swap the haplotype names to fit the unswapped alleles as they are ordered in the Genotype:
+ */
+ curPairNames = new String[]{prevPairNames[1], prevPairNames[0]};
+ }
+
+ return curPairNames;
}
private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) {
@@ -1036,8 +1093,8 @@ public class ReadBackedPhasing extends RodWalker marginalizeInds;
- public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) {
+ public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int[] marginalizeInds) {
super(hetGenotypes);
this.SNPallelePairs = new SNPallelePair[genotypes.length];
for (int i = 0; i < genotypes.length; i++)
SNPallelePairs[i] = new SNPallelePair(genotypes[i]);
- this.startIndex = startIndex;
- this.marginalizeLength = marginalizeLength;
+ this.marginalizeInds = new TreeSet();
+ for (int mind : marginalizeInds)
+ this.marginalizeInds.add(mind);
}
public PhasingTable getNewTable() {
+ int startIndex = marginalizeInds.iterator().next();
+
PhasingTable table = new PhasingTable();
for (Haplotype hap : getAllHaplotypes()) {
if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) {
@@ -1313,8 +1372,7 @@ public class ReadBackedPhasing extends RodWalker
Date: Wed, 9 Jan 2013 14:44:22 -0500
Subject: [PATCH 002/113] Added more options to control RBP behavior; raised
default RBP PQ threshold to 20
---
.../sting/gatk/walkers/phasing/ReadBackedPhasing.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
index ef84ed10b..d0b5fc5bd 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
@@ -117,7 +117,7 @@ public class ReadBackedPhasing extends RodWalker P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9
+ protected Double phaseQualityThresh = 20.0; // PQ = 20.0 <=> P(error) = 10^(-20/10) = 0.01, P(correct) = 0.99
@Hidden
@Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files [For DEBUGGING purposes only - DO NOT USE!]", required = false)
From 5577715ae25c7b668366daefbdb99b53626f4cd0 Mon Sep 17 00:00:00 2001
From: Menachem Fromer
Date: Wed, 6 Mar 2013 10:18:23 -0500
Subject: [PATCH 003/113] Have all significant XHMM commands be run with
LongRunTime
---
.../sting/queue/qscripts/CNV/xhmmCNVpipeline.scala | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala
index 7dd771873..9bb031c38 100644
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala
@@ -196,7 +196,7 @@ class xhmmCNVpipeline extends QScript {
}
addAll(docs)
- val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit
+ val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit with LongRunTime
add(mergeDepths)
var excludeTargets : List[File] = List[File]()
@@ -305,7 +305,7 @@ class xhmmCNVpipeline extends QScript {
}
}
- class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit {
+ class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime {
@Input(doc = "")
val input = inputParam
@@ -335,7 +335,7 @@ class xhmmCNVpipeline extends QScript {
override def description = "Filters samples and targets and then mean-centers the targets: " + command
}
- class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
+ class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime {
@Input(doc = "")
val input = inputParam
@@ -358,7 +358,7 @@ class xhmmCNVpipeline extends QScript {
override def description = "Runs PCA on mean-centered data: " + command
}
- class Normalize(pca: PCA) extends CommandLineFunction {
+ class Normalize(pca: PCA) extends CommandLineFunction with LongRunTime {
@Input(doc = "")
val input = pca.input
@@ -387,7 +387,7 @@ class xhmmCNVpipeline extends QScript {
override def description = "Normalizes mean-centered data using PCA information: " + command
}
- class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit {
+ class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime {
@Input(doc = "")
val input = inputParam
@@ -413,7 +413,7 @@ class xhmmCNVpipeline extends QScript {
override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command
}
- class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit {
+ class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime {
@Input(doc = "")
val input = inputParam
From 13240588cf70052c7139fa3e30b54d7fd071286f Mon Sep 17 00:00:00 2001
From: Menachem Fromer
Date: Mon, 6 May 2013 13:52:14 -0400
Subject: [PATCH 005/113] Fix to only consider the samples that are both in the
PED file and in the VCF file
---
.../sting/gatk/walkers/phasing/PhaseByTransmission.java | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index a4c1caf86..87e9a6ea0 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -450,9 +450,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
//Get the trios from the families passed as ped
- setTrios();
+ setTrios(vcfSamples);
if(trios.size()<1)
- throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted.");
+ throw new UserException.BadInput("No PED file passed or no *non-skipped* trios found in PED file. Aborted.");
Set headerLines = new HashSet();
@@ -471,9 +471,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
/**
* Select trios and parent/child pairs only
*/
- private void setTrios(){
+ private void setTrios(Set vcfSamples){
- Map> families = this.getSampleDB().getFamilies();
+ Map> families = this.getSampleDB().getFamilies(vcfSamples);
Set family;
ArrayList parents;
for(Map.Entry> familyEntry : families.entrySet()){
From c7dcc2b53bba03a2024919b467316a60dadc9de9 Mon Sep 17 00:00:00 2001
From: Menachem Fromer
Date: Mon, 6 May 2013 15:47:27 -0400
Subject: [PATCH 006/113] Fix to deal with multi-generational families being
allowed if only one level (one 'trio', effectively) appears in the VCF
---
.../gatk/walkers/phasing/PhaseByTransmission.java | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 87e9a6ea0..7bbc4e981 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -478,6 +478,16 @@ public class PhaseByTransmission extends RodWalker, HashMa
ArrayList parents;
for(Map.Entry> familyEntry : families.entrySet()){
family = familyEntry.getValue();
+
+ // Since getFamilies(vcfSamples) above still returns parents of samples in the VCF even if those parents are not in the VCF, need to subset down here:
+ Set familyMembersInVCF = new TreeSet();
+ for(Sample familyMember : family){
+ if (vcfSamples.contains(familyMember.getID())) {
+ familyMembersInVCF.add(familyMember);
+ }
+ }
+ family = familyMembersInVCF;
+
if(family.size()<2 || family.size()>3){
logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey(),family.size()));
}
@@ -488,8 +498,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(family.containsAll(parents))
this.trios.add(familyMember);
else
- logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey()));
- break;
+ logger.info(String.format("Caution: Child %s of family %s skipped as info is not provided as a complete trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Child skipped.", familyMember.getID(), familyEntry.getKey()));
}
}
}
From e33d3dafc6554c4fea705e62dabb93115d457240 Mon Sep 17 00:00:00 2001
From: Menachem Fromer
Date: Fri, 3 Jan 2014 12:04:47 -0500
Subject: [PATCH 007/113] Add documentation for RBP, and also update the MD5
for the tests now that the output uses HP tags instead of '|', which is now
reserved for trio-based phasing
---
.../walkers/phasing/ReadBackedPhasing.java | 47 +++++++++++++++----
.../ReadBackedPhasingIntegrationTest.java | 26 ++++------
2 files changed, 47 insertions(+), 26 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
index a28fc83b5..7ed77b845 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
@@ -82,6 +82,12 @@ import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersF
/**
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
*
+ * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites.
+ *
+ * The underlying algorithm is based on building up 2^n local haplotypes,
+ * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter).
+ * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag).
+ *
*
* Performs physical phasing of SNP calls, based on sequencing reads.
*
@@ -161,13 +167,6 @@ public class ReadBackedPhasing extends RodWalker unphasedSiteQueue = null;
@@ -327,6 +326,7 @@ public class ReadBackedPhasing extends RodWalker processQueue(PhasingStats phaseStats, boolean processAll) {
List oldPhasedList = new LinkedList();
@@ -362,6 +362,7 @@ public class ReadBackedPhasing extends RodWalker discardIrrelevantPhasedSites() {
List vcList = new LinkedList();
@@ -517,6 +518,7 @@ public class ReadBackedPhasing extends RodWalker= phaseQualityThresh;
}
+ // A genotype and the base pileup that supports it
private static class GenotypeAndReadBases {
public Genotype genotype;
public ReadBasesAtPosition readBases;
@@ -529,6 +531,7 @@ public class ReadBackedPhasing extends RodWalker listHetGenotypes, String sample, GenomeLoc phasingLoc) {
buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null);
}
@@ -650,6 +654,7 @@ public class ReadBackedPhasing extends RodWalker> edgeReads;
@@ -695,6 +700,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousReads(int numHetSites) {
PhasingGraph readGraph = new PhasingGraph(numHetSites);
EdgeToReads edgeToReads = new EdgeToReads();
@@ -840,6 +846,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousSites(List listHetGenotypes) {
Set sitesWithReads = new HashSet();
for (Map.Entry nameToReads : readsAtHetSites.entrySet()) {
@@ -879,6 +886,10 @@ public class ReadBackedPhasing extends RodWalker {
private Vector grb;
@@ -916,6 +927,7 @@ public class ReadBackedPhasing extends RodWalker trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) {
if (DEBUG)
logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
@@ -966,6 +978,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases;
@@ -1214,6 +1229,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases;
@@ -1313,6 +1329,9 @@ public class ReadBackedPhasing extends RodWalker hapMap = new TreeMap();
for (PhasingTable.PhasingTableEntry pte : table) {
@@ -1366,6 +1388,7 @@ public class ReadBackedPhasing extends RodWalker marginalizeInds;
@@ -1412,6 +1435,9 @@ public class ReadBackedPhasing extends RodWalker {
private LinkedList table;
@@ -1464,6 +1491,7 @@ public class ReadBackedPhasing extends RodWalker {
private HaplotypeClass haplotypeClass;
private PhasingScore score;
@@ -1528,6 +1557,7 @@ public class ReadBackedPhasing extends RodWalker multipleBaseCounts = null;
@@ -1644,6 +1674,7 @@ class HaplotypeClass implements Iterable {
}
}
+// Summary statistics about phasing rates, for each sample
class PhasingStats {
private int numReads;
private int numVarSites;
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java
index 9759004a0..8c8817fe6 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java
@@ -72,7 +72,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:332341-382503",
1,
- Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c"));
+ Arrays.asList("1bb034bd54421fe4884e3142ed92d47e"));
executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec);
}
@@ -82,7 +82,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:1232503-1332503",
1,
- Arrays.asList("a3ca151145379e0d4bae64a91165ea0b"));
+ Arrays.asList("c12954252d4c8659b5ecf7517b277496"));
executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec);
}
@@ -92,7 +92,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30)
+ " -L chr20:332341-382503",
1,
- Arrays.asList("f685803333123a102ce1851d984cbd10"));
+ Arrays.asList("0b945e30504d04e9c6fa659ca5c25ed5"));
executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec);
}
@@ -102,7 +102,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100)
+ " -L chr20:332341-382503",
1,
- Arrays.asList("aaa7c25d118383639f273128d241e140"));
+ Arrays.asList("e9e8ef92d694ca71f29737fba26282f5"));
executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec);
}
@@ -112,7 +112,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10)
+ " -L chr20:332341-482503",
1,
- Arrays.asList("418e29400762972e77bae4f73e16befe"));
+ Arrays.asList("b9c9347c760a06db635952bf4920fb48"));
executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec);
}
@@ -122,7 +122,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:652810-681757",
1,
- Arrays.asList("4c8f6190ecc86766baba3aba08542991"));
+ Arrays.asList("02c3a903842aa035ae379f16bc3d64ae"));
executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec);
}
@@ -132,18 +132,8 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10)
+ " -L chr20:332341-802503",
1,
- Arrays.asList("44eb225ab3167651ec0a9e1fdcc83d34"));
- executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec);
- }
-
- @Test
- public void test8() {
- WalkerTestSpec spec = new WalkerTestSpec(
- baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10)
- + " -L chr20:332341-802503" + " -respectPhaseInInput",
- 1,
- Arrays.asList("e3549b89d49092e73cc6eb21f233471c"));
- executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec);
+ Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574"));
+        executeTest("Use trio-phased VCF, adding read-backed phasing information in HP tag (as is now standard for RBP) [TEST SEVEN]", spec);
}
}
From c133909d328b2759a5ca6c703e02774f740c7421 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 2 Jan 2014 15:35:46 -0500
Subject: [PATCH 008/113] Fixed edge condition in the realigner where a
realigned read can sometimes get partially aligned off the end of the contig.
Now we ignore such reads (which is much easier than trying to figure out when to soft-clip).
Added unit test.
---
.../gatk/walkers/indels/IndelRealigner.java | 46 +++++++++--
.../indels/IndelRealignerUnitTest.java | 82 +++++++++++++++++++
2 files changed, 122 insertions(+), 6 deletions(-)
create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
index c0848663e..a9b14e40b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@@ -1548,16 +1548,28 @@ public class IndelRealigner extends ReadWalker {
return false;
}
- // annotate the record with the original cigar (and optionally the alignment start)
- if ( !NO_ORIGINAL_ALIGNMENT_TAGS ) {
- read.setAttribute(ORIGINAL_CIGAR_TAG, read.getCigar().toString());
- if ( newStart != read.getAlignmentStart() )
- read.setAttribute(ORIGINAL_POSITION_TAG, read.getAlignmentStart());
- }
+ // store the old CIGAR and start in case we need to back out
+ final Cigar oldCigar = read.getCigar();
+ final int oldStart = read.getAlignmentStart();
+ // try updating the read with the new CIGAR and start
read.setCigar(newCigar);
read.setAlignmentStart(newStart);
+ // back out if necessary
+ if ( realignmentProducesBadAlignment(read) ) {
+ read.setCigar(oldCigar);
+ read.setAlignmentStart(oldStart);
+ return false;
+ }
+
+ // annotate the record with the original cigar and start (if it changed)
+ if ( !NO_ORIGINAL_ALIGNMENT_TAGS ) {
+ read.setAttribute(ORIGINAL_CIGAR_TAG, oldCigar.toString());
+ if ( newStart != oldStart )
+ read.setAttribute(ORIGINAL_POSITION_TAG, oldStart);
+ }
+
return true;
}
@@ -1578,6 +1590,28 @@ public class IndelRealigner extends ReadWalker {
}
}
+ /**
+ * Determines whether the read aligns off the end of the contig
+ *
+ * @param read the read to check
+ * @return true if it aligns off the end
+ */
+ private boolean realignmentProducesBadAlignment(final GATKSAMRecord read) {
+ final int contigLength = referenceReader.getSequenceDictionary().getSequence(currentInterval.getContig()).getSequenceLength();
+ return realignmentProducesBadAlignment(read, contigLength);
+ }
+
+ /**
+ * Determines whether the read aligns off the end of the contig.
+ * Pulled out to make it testable.
+ *
+ * @param read the read to check
+ * @return true if it aligns off the end
+ */
+ protected static boolean realignmentProducesBadAlignment(final GATKSAMRecord read, final int contigLength) {
+ return read.getAlignmentEnd() > contigLength;
+ }
+
private static class Consensus {
public final byte[] str;
public final ArrayList> readIndexes;
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java
new file mode 100644
index 000000000..509bf7465
--- /dev/null
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerUnitTest.java
@@ -0,0 +1,82 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.indels;
+
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.samtools.SAMFileHeader;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.testng.Assert;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+public class IndelRealignerUnitTest extends BaseTest {
+
+ private SAMFileHeader header;
+
+ @BeforeClass
+ public void setup() throws FileNotFoundException {
+ final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
+ header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
+ }
+
+ @Test
+ public void realignAtContigBorderTest() {
+ final int contigEnd = header.getSequence(0).getSequenceLength();
+ final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "goodRead", 0, contigEnd - 1, 2);
+ read.setCigarString("2M");
+ Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), false);
+ read.setCigarString("1M1D1M");
+ Assert.assertEquals(IndelRealigner.realignmentProducesBadAlignment(read, contigEnd), true);
+ }
+
+}
From f172c349f614918bc238b7c22bc4449ba023a136 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Mon, 6 Jan 2014 16:12:37 -0500
Subject: [PATCH 009/113] Adding the functionality to enable users to input a
file of VCFs for -V.
To do this I have added a RodBindingCollection which can represent either a VCF or a
file of VCFs. Note that e.g. SelectVariants allows a list of RodBindingCollections so
that one can intermix VCFs and VCF lists.
For VariantContext tags with a list, by default the tags for the -V argument are applied
unless overridden by the individual line. In other words, any given line can have either
one token (the file path) or two tokens (the new tags and the file path). For example:
foo.vcf
VCF,name=bar bar.vcf
Note that a VCF list file name must end with '.list'.
Added this functionality to CombineVariants, CombineReferenceCalculationVariants, and VariantRecalibrator.
---
.../VariantRecalibrator.java | 9 +-
.../CombineReferenceCalculationVariants.java | 9 +-
.../commandline/ArgumentTypeDescriptor.java | 140 +++++++++++++++++-
.../sting/commandline/IntervalBinding.java | 8 +-
.../sting/commandline/ParsingEngine.java | 1 +
.../sting/commandline/RodBinding.java | 4 +-
.../commandline/RodBindingCollection.java | 89 +++++++++++
.../walkers/variantutils/CombineVariants.java | 7 +-
.../RodBindingCollectionUnitTest.java | 126 ++++++++++++++++
9 files changed, 378 insertions(+), 15 deletions(-)
create mode 100644 public/java/src/org/broadinstitute/sting/commandline/RodBindingCollection.java
create mode 100644 public/java/test/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
index 5da7b4219..d43dc4a12 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
-import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.Utils;
@@ -155,7 +154,8 @@ public class VariantRecalibrator extends RodWalker> input;
+ public List> inputCollections;
+ final private List> input = new ArrayList<>();
/**
* These additional calls should be unfiltered and annotated with the error covariates that are intended to be used for modeling.
@@ -272,7 +272,6 @@ public class VariantRecalibrator extends RodWalker hInfo = new HashSet<>();
ApplyRecalibration.addVQSRStandardHeaderLines(hInfo);
recalWriter.writeHeader( new VCFHeader(hInfo) );
@@ -280,6 +279,10 @@ public class VariantRecalibrator extends RodWalker inputCollection : inputCollections )
+ input.addAll(inputCollection.getRodBindings());
}
//---------------------------------------------------------------------------------------------------------------
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java
index a587b0250..2a004aaca 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java
@@ -111,13 +111,12 @@ import java.util.*;
@Reference(window=@Window(start=-10,stop=10))
public class CombineReferenceCalculationVariants extends RodWalker implements AnnotatorCompatible, TreeReducible {
- // TODO -- allow a file of VCF paths to be entered?
-
/**
* The VCF files to merge together
*/
@Input(fullName="variant", shortName = "V", doc="One or more input VCF files", required=true)
- public List> variants;
+ public List> variantCollections;
+ final private List> variants = new ArrayList<>();
@Output(doc="File to which variants should be written")
protected VariantContextWriter vcfWriter = null;
@@ -169,6 +168,10 @@ public class CombineReferenceCalculationVariants extends RodWalkeremptyList(), this, getToolkit());
+
+ // collect the actual rod bindings into a list for use later
+ for ( final RodBindingCollection variantCollection : variantCollections )
+ variants.addAll(variantCollection.getRodBindings());
}
public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index a70d6e706..14b5118ad 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -25,6 +25,7 @@
package org.broadinstitute.sting.commandline;
+import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
@@ -36,6 +37,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
+import java.io.IOException;
import java.lang.annotation.Annotation;
import java.lang.reflect.*;
import java.util.*;
@@ -306,6 +308,7 @@ public abstract class ArgumentTypeDescriptor {
* @param source source
* @param type type to check
* @param matches matches
+ * @param tags argument tags
* @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding.
*/
protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) {
@@ -409,6 +412,95 @@ public abstract class ArgumentTypeDescriptor {
value, fieldName, e.getMessage()));
}
}
+
+ /**
+ * Parse the source of a RodBindingCollection, which can be either a file of RodBindings or an actual RodBinding.
+ *
+ * @param parsingEngine the parsing engine used to validate this argument type descriptor
+ * @param source source
+ * @param type type
+ * @param matches matches
+ * @param tags argument tags
+ * @return the newly created binding object
+ */
+ public Object parseRodBindingCollectionSource(final ParsingEngine parsingEngine,
+ final ArgumentSource source,
+ final Type type,
+ final ArgumentMatches matches,
+ final Tags tags) {
+
+ final ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
+ final ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches);
+ @SuppressWarnings("unchecked")
+ Class extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
+ String name = defaultDefinition.fullName;
+
+ // if this a list of files, get those bindings
+ final File file = value.asFile();
+ try {
+ if (file.getAbsolutePath().endsWith(".list")) {
+ return getRodBindingsCollection(file, parsingEngine, parameterType, name, tags, source.field.getName());
+ }
+ } catch (IOException e) {
+ throw new UserException.CouldNotReadInputFile(file, e);
+ }
+
+ // otherwise, treat this as an individual binding
+ final RodBinding binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, name, tags, source.field.getName());
+ parsingEngine.addTags(binding, tags);
+ parsingEngine.addRodBinding(binding);
+ return RodBindingCollection.createRodBindingCollectionOfType(parameterType, Arrays.asList(binding));
+ }
+
+ /**
+ * Retrieve and parse a collection of RodBindings from the given file.
+ *
+ * @param file the source file
+ * @param parsingEngine the engine responsible for parsing
+ * @param parameterType the Tribble Feature parameter type
+ * @param bindingName the name of the binding passed to the constructor.
+ * @param defaultTags general tags for the binding used for parsing and passed to the constructor.
+ * @param fieldName the name of the field that was parsed. Used for error reporting.
+ * @return the newly created collection of binding objects.
+ */
+ public static Object getRodBindingsCollection(final File file,
+ final ParsingEngine parsingEngine,
+ final Class extends Feature> parameterType,
+ final String bindingName,
+ final Tags defaultTags,
+ final String fieldName) throws IOException {
+ final List bindings = new ArrayList<>();
+
+ // parse each line separately using the given Tags if none are provided on each line
+ for ( final String line: FileUtils.readLines(file) ) {
+ final String[] tokens = line.split("\\s+");
+ final RodBinding binding;
+
+ if ( tokens.length == 0 ) {
+ continue; // empty line, so do nothing
+ }
+ // use the default tags if none are provided for this binding
+ else if ( tokens.length == 1 ) {
+ final ArgumentMatchValue value = new ArgumentMatchStringValue(tokens[0]);
+ binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, defaultTags, fieldName);
+ parsingEngine.addTags(binding, defaultTags);
+ }
+ // use the new tags if provided
+ else if ( tokens.length == 2 ) {
+ final Tags tags = ParsingMethod.parseTags(fieldName, tokens[0]);
+ final ArgumentMatchValue value = new ArgumentMatchStringValue(tokens[1]);
+ binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, tags, fieldName);
+ parsingEngine.addTags(binding, tags);
+ } else {
+ throw new UserException.BadArgumentValue(fieldName, "data lines should consist of an optional set of tags along with a path to a file; too many tokens are present for line: " + line);
+ }
+
+ bindings.add(binding);
+ parsingEngine.addRodBinding(binding);
+ }
+
+ return RodBindingCollection.createRodBindingCollectionOfType(parameterType, bindings);
+ }
}
/**
@@ -487,14 +579,60 @@ class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
}
+/**
+ * Parser for RodBindingCollection objects
+ */
+class RodBindingCollectionArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+ /**
+ * We only want RodBindingCollection class objects
+ * @param type The type to check.
+ * @return true if the provided class is an RodBindingCollection.class
+ */
+ @Override
+ public boolean supports( final Class type ) {
+ return isRodBindingCollection(type);
+ }
+
+ public static boolean isRodBindingCollection( final Class type ) {
+ return RodBindingCollection.class.isAssignableFrom(type);
+ }
+
+ /**
+ * See note from RodBindingArgumentTypeDescriptor.parse().
+ *
+ * @param parsingEngine parsing engine
+ * @param source source
+ * @param type type to check
+ * @param matches matches
+ * @return the IntervalBinding object.
+ */
+ @Override
+ public Object parse(final ParsingEngine parsingEngine, final ArgumentSource source, final Type type, final ArgumentMatches matches) {
+ final Tags tags = getArgumentTags(matches);
+ return parseRodBindingCollectionSource(parsingEngine, source, type, matches, tags);
+ }
+}
+
/**
* Parse simple argument types: java primitives, wrapper classes, and anything that has
* a simple String constructor.
*/
class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+
+ /**
+ * @param type the class type
+ * @return true if this class is a binding type, false otherwise
+ */
+ private boolean isBinding(final Class type) {
+ return RodBindingArgumentTypeDescriptor.isRodBinding(type) ||
+ IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ||
+ RodBindingCollectionArgumentTypeDescriptor.isRodBindingCollection(type);
+ }
+
+
@Override
public boolean supports( Class type ) {
- if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) || IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ) return false;
+ if ( isBinding(type) ) return false;
if ( type.isPrimitive() ) return true;
if ( type.isEnum() ) return true;
if ( primitiveToWrapperMap.containsValue(type) ) return true;
diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
index 9253e1ee5..de57de871 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
@@ -57,7 +57,7 @@ public final class IntervalBinding {
@Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"})
public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) {
- featureIntervals = new RodBinding(type, rawName, source, tribbleType, tags);
+ featureIntervals = new RodBinding<>(type, rawName, source, tribbleType, tags);
}
@Requires({"intervalArgument != null"})
@@ -66,9 +66,7 @@ public final class IntervalBinding {
}
public String getSource() {
- if ( featureIntervals != null )
- return featureIntervals.getSource();
- return stringIntervals;
+ return ( featureIntervals != null ? featureIntervals.getSource() : stringIntervals );
}
public List getIntervals(final GenomeAnalysisEngine toolkit) {
@@ -79,7 +77,7 @@ public final class IntervalBinding {
List intervals;
if ( featureIntervals != null ) {
- intervals = new ArrayList();
+ intervals = new ArrayList<>();
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
index aca20d5a1..ad64aaa1d 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
@@ -83,6 +83,7 @@ public class ParsingEngine {
private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(),
new IntervalBindingArgumentTypeDescriptor(),
new RodBindingArgumentTypeDescriptor(),
+ new RodBindingCollectionArgumentTypeDescriptor(),
new CompoundArgumentTypeDescriptor(),
new MultiplexArgumentTypeDescriptor()) );
diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java
index ef8e01df4..87fa85858 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java
@@ -32,7 +32,7 @@ import org.broad.tribble.Feature;
import java.util.*;
/**
- * A RodBinding representing a walker argument that gets bound to a ROD track.
+ * A RodBinding represents a walker argument that gets bound to a ROD track.
*
* The RodBinding is a formal GATK argument that bridges between a walker and
* the RefMetaDataTracker to obtain data about this rod track at runtime. The RodBinding
@@ -77,7 +77,7 @@ public final class RodBinding {
final private String tribbleType;
/** The command line tags associated with this RodBinding */
final private Tags tags;
- /** The Java class expected for this RodBinding. Must correspond to the type emited by Tribble */
+ /** The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble */
final private Class type;
/** True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments */
final private boolean bound;
diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBindingCollection.java b/public/java/src/org/broadinstitute/sting/commandline/RodBindingCollection.java
new file mode 100644
index 000000000..d8306ea5a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/RodBindingCollection.java
@@ -0,0 +1,89 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.sting.commandline;
+
+import com.google.java.contract.Ensures;
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.*;
+
+/**
+ * A RodBindingCollection represents a collection of RodBindings.
+ *
+ * The RodBindingCollection is a formal GATK argument that is used to specify a file of RodBindings.
+ *
+ */
+public final class RodBindingCollection {
+
+ /** The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble */
+ final private Class type;
+
+ private Collection> rodBindings;
+
+ public RodBindingCollection(final Class type, final Collection> rodBindings) {
+ this.type = type;
+ this.rodBindings = Collections.unmodifiableCollection(rodBindings);
+ }
+
+ /**
+ * @return the collection of RodBindings
+ */
+ final public Collection> getRodBindings() {
+ return rodBindings;
+ }
+
+ /**
+ * @return the string name of the tribble type, such as vcf, bed, etc.
+ */
+ @Ensures({"result != null"})
+ final public Class getType() {
+ return type;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("(RodBindingCollection %s)", getRodBindings());
+ }
+
+ /**
+ * Utility method to help construct a RodBindingCollection of the given Feature type
+ *
+ * @param type the Feature type
+ * @param rodBindings the rod bindings to put into the collection
+ * @return a new RodBindingCollection object
+ */
+ public static Object createRodBindingCollectionOfType(final Class extends Feature> type, final Collection rodBindings) {
+ try {
+ final Constructor ctor = RodBindingCollection.class.getConstructor(Class.class, Collection.class);
+ return ctor.newInstance(type, rodBindings);
+ } catch (final Exception e) {
+ throw new IllegalStateException("Failed to create a RodBindingCollection for type " + type);
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
index e13252d49..152128022 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@@ -131,7 +131,8 @@ public class CombineVariants extends RodWalker implements Tree
* a rod priority list is provided.
*/
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
- public List> variants;
+ public List> variantCollections;
+ final private List> variants = new ArrayList<>();
@Output(doc="File to which variants should be written")
protected VariantContextWriter vcfWriter = null;
@@ -230,6 +231,10 @@ public class CombineVariants extends RodWalker implements Tree
VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER);
vcfWriter.writeHeader(vcfHeader);
+
+ // collect the actual rod bindings into a list for use later
+ for ( final RodBindingCollection variantCollection : variantCollections )
+ variants.addAll(variantCollection.getRodBindings());
}
private void validateAnnotateUnionArguments() {
diff --git a/public/java/test/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java
new file mode 100644
index 000000000..29d38ec19
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java
@@ -0,0 +1,126 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.sting.commandline;
+
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+import org.testng.Assert;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collection;
+
+public class RodBindingCollectionUnitTest extends BaseTest {
+
+ private ParsingEngine parsingEngine;
+ private Tags mytags;
+
+ private static final String defaultTagString = "VCF";
+ private static final String testVCFFileName = privateTestDir + "empty.vcf";
+ private static final String testListFileName = privateTestDir + "oneVCF.list";
+
+ @BeforeMethod
+ public void setUp() {
+ parsingEngine = new ParsingEngine(null);
+ RodBinding.resetNameCounter();
+ mytags = new Tags();
+ mytags.addPositionalTag(defaultTagString);
+ }
+
+ private class RodBindingCollectionArgProvider {
+ @Argument(fullName="input",doc="input",shortName="V")
+ public RodBindingCollection input;
+ }
+
+ @Test
+ public void testStandardVCF() {
+ final String[] commandLine = new String[] {"-V", testVCFFileName};
+
+ parsingEngine.addArgumentSource( RodBindingCollectionArgProvider.class );
+ parsingEngine.parse( commandLine );
+ parsingEngine.validate();
+
+ final RodBindingCollectionArgProvider argProvider = new RodBindingCollectionArgProvider();
+ parsingEngine.loadArgumentsIntoObject( argProvider );
+
+ Assert.assertEquals(argProvider.input.getRodBindings().iterator().next().getSource(), testVCFFileName, "Argument is not correctly initialized");
+ }
+
+ @Test
+ public void testList() {
+ final String[] commandLine = new String[] {"-V", testListFileName};
+
+ parsingEngine.addArgumentSource(RodBindingCollectionArgProvider.class);
+ parsingEngine.parse( commandLine );
+ parsingEngine.validate();
+
+ final RodBindingCollectionArgProvider argProvider = new RodBindingCollectionArgProvider();
+ parsingEngine.loadArgumentsIntoObject( argProvider );
+
+ Assert.assertEquals(argProvider.input.getRodBindings().iterator().next().getSource(), "private/testdata/empty.vcf", "Argument is not correctly initialized");
+ }
+
+ @Test
+ public void testDefaultTagsInFile() throws IOException {
+
+ final File testFile = File.createTempFile("RodBindingCollectionUnitTest.defaultTags", ".list");
+ testFile.deleteOnExit();
+ final FileWriter writer = new FileWriter(testFile);
+ writer.write(testVCFFileName, 0, testVCFFileName.length());
+ writer.close();
+
+ ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input");
+
+ final Collection bindings = parsingEngine.getRodBindings();
+ Assert.assertNotNull(bindings);
+ Assert.assertEquals(bindings.size(), 1);
+
+ final RodBinding binding = bindings.iterator().next();
+ Assert.assertEquals(parsingEngine.getTags(binding), mytags);
+ }
+
+ @Test
+ public void testOverrideTagsInFile() throws IOException {
+ final File testFile = File.createTempFile("RodBindingCollectionUnitTest.overrideTags", ".list");
+ testFile.deleteOnExit();
+ final FileWriter writer = new FileWriter(testFile);
+ final String textToWrite = "foo " + testVCFFileName;
+ writer.write(textToWrite, 0, textToWrite.length());
+ writer.close();
+
+ ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input");
+
+ final Collection bindings = parsingEngine.getRodBindings();
+ Assert.assertNotNull(bindings);
+ Assert.assertEquals(bindings.size(), 1);
+
+ final RodBinding binding = bindings.iterator().next();
+ Assert.assertNotEquals(parsingEngine.getTags(binding), mytags);
+ }
+}
From 0323caefc87b9f82ddaa0a114e7cd769971b5727 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Tue, 7 Jan 2014 15:50:21 -0500
Subject: [PATCH 010/113] Added some bug fixes to the gVCF merging code after
finally getting some real data to play with. Still under construction,
awaiting more test data from Valentin.
---
.../variant/GATKVariantContextUtils.java | 38 +++++++++----------
.../GATKVariantContextUtilsUnitTest.java | 7 ++--
2 files changed, 22 insertions(+), 23 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java
index c36d7f888..7d4d66f7c 100644
--- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java
@@ -33,6 +33,7 @@ import org.broad.tribble.TribbleException;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.vcf.VCFConstants;
@@ -1064,7 +1065,6 @@ public class GATKVariantContextUtils {
final Set inconsistentAttributes = new HashSet<>();
final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id
- VariantContext longestVC = first;
int depth = 0;
final Map> annotationMap = new LinkedHashMap<>();
GenotypesContext genotypes = GenotypesContext.create();
@@ -1084,10 +1084,6 @@ public class GATKVariantContextUtils {
if ( isSpanningEvent )
continue;
- // keep track of the longest location that starts here
- if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) )
- longestVC = vc;
-
// special case ID (just preserve it)
if ( vc.hasID() ) rsIDs.add(vc.getID());
@@ -1105,15 +1101,15 @@ public class GATKVariantContextUtils {
if ( depth > 0 )
attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));
+ // remove stale AC and AF based attributes
+ removeStaleAttributesAfterMerge(attributes);
+
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles)
- .loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd())
+ .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(alleles, loc.getStart())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later
- // remove stale AC and AF based attributes
- removeStaleAttributesAfterMerge(builder);
-
return builder.make();
}
@@ -1147,16 +1143,17 @@ public class GATKVariantContextUtils {
}
/**
- * Remove the stale attributes from the merged VariantContext (builder)
+ * Remove the stale attributes from the merged set
*
- * @param builder the VC builder
+ * @param attributes the attribute map
*/
- private static void removeStaleAttributesAfterMerge(final VariantContextBuilder builder) {
- builder.rmAttributes(Arrays.asList(VCFConstants.ALLELE_COUNT_KEY,
- VCFConstants.ALLELE_FREQUENCY_KEY,
- VCFConstants.ALLELE_NUMBER_KEY,
- VCFConstants.MLE_ALLELE_COUNT_KEY,
- VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
+ private static void removeStaleAttributesAfterMerge(final Map attributes) {
+ attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
+ attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
+ attributes.remove(VCFConstants.ALLELE_NUMBER_KEY);
+ attributes.remove(VCFConstants.MLE_ALLELE_COUNT_KEY);
+ attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
+ attributes.remove(VCFConstants.END_KEY);
}
/**
@@ -1544,7 +1541,7 @@ public class GATKVariantContextUtils {
final String name = g.getSampleName();
if ( !mergedGenotypes.containsSample(name) ) {
// we need to modify it even if it already contains all of the alleles because we need to purge the PLs out anyways
- final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles);
+ final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart());
final int[] PLs = generatePLs(g, indexesOfRelevantAlleles);
// note that we set the alleles to null here (as we expect it to be re-genotyped)
final Genotype newG = new GenotypeBuilder(g).name(name).alleles(null).PL(PLs).noAD().noGQ().make();
@@ -1559,15 +1556,16 @@ public class GATKVariantContextUtils {
*
* @param remappedAlleles the list of alleles to evaluate
* @param targetAlleles the target list of alleles
+ * @param position position to use for error messages
* @return non-null array of ints representing indexes
*/
- protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles) {
+ protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) {
if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty");
if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty");
if ( !remappedAlleles.contains(NON_REF_SYMBOLIC_ALLELE) )
- throw new IllegalArgumentException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele; please use the Haplotype Caller with gVCF output to generate appropriate records");
+ throw new UserException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE);
final int[] indexMapping = new int[targetAlleles.size()];
diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java
index 23a24e180..6672e3264 100644
--- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java
@@ -29,6 +29,7 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.variant.variantcontext.*;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
@@ -1463,14 +1464,14 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
}
}
- @Test(expectedExceptions = IllegalArgumentException.class)
+ @Test(expectedExceptions = UserException.class)
public void testGetIndexesOfRelevantAllelesWithNoALT() {
final List alleles1 = new ArrayList<>(1);
alleles1.add(Allele.create("A", true));
final List alleles2 = new ArrayList<>(1);
alleles2.add(Allele.create("A", true));
- GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2);
+ GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1);
Assert.fail("We should have thrown an exception because the allele was not present");
}
@@ -1502,7 +1503,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
if ( allelesIndex > 0 )
myAlleles.add(allAlleles.get(allelesIndex));
- final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles);
+ final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1);
Assert.assertEquals(indexes.length, allAlleles.size());
From 851ec67bdc878c0f810cb2ca7cb4f227bf2e98a2 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Mon, 13 Jan 2014 14:36:02 -0500
Subject: [PATCH 013/113] Adding more meta information about the user to the
GATK logging output, per Tim F's request.
---
.../sting/utils/help/HelpFormatter.java | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java
index d700bff28..f2e3fad4b 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java
@@ -30,6 +30,7 @@ import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
+import java.net.InetAddress;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;
@@ -295,6 +296,7 @@ public class HelpFormatter {
String output = sourceName + " Args: " + entry.getValue().getDescription();
logger.info(output);
}
+ logger.info(generateUserHelpData());
logger.info("Date/Time: " + dateFormat.format(date));
logger.info(barrier);
@@ -303,6 +305,23 @@ public class HelpFormatter {
logger.info(barrier);
}
+ /**
+ * Create the user-related help information.
+ * @return a non-null, non-empty String with the relevant information.
+ */
+ private static String generateUserHelpData() {
+ try {
+ return "Executing as " +
+ System.getProperty("user.name") + "@" + InetAddress.getLocalHost().getHostName() +
+ " on " + System.getProperty("os.name") + " " + System.getProperty("os.version") +
+ " " + System.getProperty("os.arch") + "; " + System.getProperty("java.vm.name") +
+ " " + System.getProperty("java.runtime.version") + ".";
+ } catch (Exception e) {
+ // don't fail
+ return "";
+ }
+ }
+
/**
* Create a barrier to use to distinguish the header from the rest of the output.
* @param text A collection of lines to output as part of a header.
From c7e08965d067bb7e0ef325fa1338ca4c121d1a60 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Fri, 10 Jan 2014 01:36:39 -0500
Subject: [PATCH 014/113] The QD normalization for indels was busted and is now
fixed.
It is true that indels of length > 1 have higher QUALS than those of length = 1. But for the HC those
QUALS are not that much higher, and it doesn't continue scaling up as the indels get larger. So we no
longer normalize by indel length (which massively over-penalizes larger events and effectively drops their
QD to 0).
For the UG the previous normalization also wasn't perfect. Now we divide the indel length by a factor
of 3 to make sure that QD is consistent over the range of indel lengths.
Integration tests change because QD is different for indels.
Also, got permission from Valentin to archive a failing test that no longer applies.
Thanks to Kurt on the GATK forum for pointing this all out.
---
.../gatk/walkers/annotator/QualByDepth.java | 22 ++++++++++++++++---
.../VariantAnnotatorIntegrationTest.java | 8 +++----
...perGeneralPloidySuite1IntegrationTest.java | 6 ++---
...perGeneralPloidySuite2IntegrationTest.java | 2 +-
...dGenotyperIndelCallingIntegrationTest.java | 16 +++++++-------
.../UnifiedGenotyperIntegrationTest.java | 10 ++++-----
...GenotyperNormalCallingIntegrationTest.java | 4 ++--
...dGenotyperReducedReadsIntegrationTest.java | 2 +-
...lexAndSymbolicVariantsIntegrationTest.java | 2 +-
.../HaplotypeCallerGVCFIntegrationTest.java | 6 ++---
.../HaplotypeCallerIntegrationTest.java | 22 +++++++++----------
11 files changed, 57 insertions(+), 43 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
index 906cfa021..106e4b443 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
@@ -108,7 +108,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
continue;
depth += perReadAlleleLikelihoods.getNumberOfStoredElements();
- } else if (genotype.hasDP() && vc.isBiallelic()) { // TODO -- this currently only works with biallelic variants for now because multiallelics have had their PLs stripped out and therefore their qual score can't be recomputed
+ } else if ( genotype.hasDP() ) {
depth += genotype.getDP();
}
}
@@ -117,13 +117,29 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
return null;
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
- double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength);
+ // Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
+ // full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
+ double QD = -10.0 * vc.getLog10PError() / ((double)depth * indelNormalizationFactor(altAlleleLength, ref != null));
+
+ // Hack: see note in the fixTooHighQD method below
QD = fixTooHighQD(QD);
- Map map = new HashMap<>();
+
+ final Map map = new HashMap<>();
map.put(getKeyNames().get(0), String.format("%.2f", QD));
return map;
}
+ /**
+ * Generate the indel normalization factor.
+ *
+ * @param altAlleleLength the average alternate allele length for the call
+ * @param increaseNormalizationAsLengthIncreases should we apply a normalization factor based on the allele length?
+ * @return a possitive double
+ */
+ private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) {
+ return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
+ }
+
/**
* The haplotype caller generates very high quality scores when multiple events are on the
* same haplotype. This causes some very good variants to have unusually high QD values,
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index 58c3bb9bd..7943eb09b 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
- final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("139a4384f5a7c1f49ada67f416642249"));
+ final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("0c331915b07b42d726bc3d623aa9997b"));
specAnn.disableShadowBCF();
final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);
@@ -384,10 +384,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
Assert.assertFalse(lineAnn == null);
final VariantContext vcAnn = codecAnn.decode(lineAnn);
- if( vc.isBiallelic() ) {
- Assert.assertTrue(vc.hasAttribute("QD"));
- Assert.assertTrue(vcAnn.hasAttribute("QD"));
- }
+ Assert.assertTrue(vc.hasAttribute("QD"));
+ Assert.assertTrue(vcAnn.hasAttribute("QD"));
}
Assert.assertFalse(lineIterator.hasNext());
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java
index 460b80121..5a16837f1 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java
@@ -69,16 +69,16 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testBOTH_GGA_Pools() {
- executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "dac2d7969e109aee9ad2dad573759f58");
+ executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "0eec36459cf1f1e3e8739ab5b1cedb39");
}
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
- executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "ceb105e3db0f2b993e3d725b0d60b6a3");
+ executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "73229442a8fe558e58dd5dd305eb2315");
}
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
- executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "4dd1b38f0389e339ce8a05956956aa8a");
+ executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "2787064918c7b391071a6ad4e5b0aba8");
}
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java
index 48f36ccc6..4c8c12887 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java
@@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
- executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","39f559996f8d429839c585bbab68dbde");
+ executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","50ebb7f74e5618acdd014dd87f2363fc");
}
@Test(enabled = true)
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java
index 6219eb578..deb0289c9 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java
@@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
- Arrays.asList("3c8727ee6e2a6f10ab728c4869dd5b92"));
+ Arrays.asList("1ad3943ae27a0062c52a19abe1c0d32c"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
- Arrays.asList("0cbe889e03bab6512680ecaebd52c536"));
+ Arrays.asList("9b4ead3da021763704fcb9d80a5ee6ff"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
- Arrays.asList("e10c49fcf9a128745c2b050a52798e58"));
+ Arrays.asList("8a0a751afdb2a8166432d9822e4d814c"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
- Arrays.asList("475f8148123792064130faf9f9030fec"));
+ Arrays.asList("422a114943a9e3e9bf5872b82cbc6340"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
- Arrays.asList("a7e4e1bd128424d46cffdd538b220074"));
+ Arrays.asList("01fec03933816e8d82aabe6e5b276dd5"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
- Arrays.asList("903af514f70db9238064da311c4ea0de"));
+ Arrays.asList("e3c95f745ebf2d4f26759878966c5280"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
- Arrays.asList("d3721bee5edaa31fdd35edd7aa75feb3"));
+ Arrays.asList("4a45d5bd459565ec35c726894430e8df"));
executeTest("test minIndelFraction 0.0", spec);
}
@@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
- Arrays.asList("a5b6d7b32953500d936d3dff512a6254"));
+ Arrays.asList("a78c663eff00b28b44f368f03b2acf1b"));
executeTest("test minIndelFraction 0.25", spec);
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index dcaed8bf2..b30d124c4 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
- testHeterozosity( 0.01, "2f3051caa785c7c1e2a8b23fa4da90b1" );
+ testHeterozosity( 0.01, "6053106407e09a6aefb78395a0e22ec4" );
}
@Test
public void testHeterozyosity2() {
- testHeterozosity( 1.0 / 1850, "228df9e38580d8ffe1134da7449fa35e" );
+ testHeterozosity( 1.0 / 1850, "37666375278259c4d7dc800a0f73c1ca" );
}
private void testHeterozosity(final double arg, final String md5) {
@@ -203,7 +203,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
- private final static String COMPRESSED_OUTPUT_MD5 = "eebec02fdde9937bffaf44902ace6207";
+ private final static String COMPRESSED_OUTPUT_MD5 = "c5c6af421cffa12fe6bdaced6cd41dd2";
@Test
public void testCompressedOutput() {
@@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
- Arrays.asList("c4248f02103e37e89b0f22c0d9c98492"));
+ Arrays.asList("630d1dcfb7650a9287d6723c38b0746a"));
executeTest(String.format("test multiple technologies"), spec);
}
@@ -279,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
- Arrays.asList("96c7862d55e933b274cabe45c9c443d9"));
+ Arrays.asList("976e88e4accb4436ad9ac97df9477648"));
executeTest(String.format("test calling with BAQ"), spec);
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java
index 01aab8ae3..47ef49845 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java
@@ -88,7 +88,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
- Arrays.asList("02b521fe88a6606a29c12c0885c3debd"));
+ Arrays.asList("75503fce7521378f8c2170094aff29df"));
executeTest("test SingleSample Pilot2", spec);
}
@@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
- Arrays.asList("a973298b2801b80057bea88507e2858d"));
+ Arrays.asList("02c7804c8013ba1ead8e02b956b5e454"));
executeTest("test reverse trim", spec);
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java
index 3b5690046..0a54acbe4 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java
@@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBamINDELs() {
- testReducedCalling("INDEL", "942930038cf7fc9a80b969461aaa9aa6");
+ testReducedCalling("INDEL", "d593628b2bc144e987a9e75e5eee0001");
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
index 3907ffbd6..5769c3a51 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
@@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
- HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88c10027c21712b1fe475c06cadd503c");
+ HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "ff19ae39b0695680ea670d53f6f9ce47");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
index 97744f126..3f6151c71 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
@@ -65,9 +65,9 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
- tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"});
- tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"});
- tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9b9923ef41bfc7346c905fdecf918f92"});
+ tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "96328c91cf9b06de231b37a22a7a7639"});
+ tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ac25e9a78b89655197513bb0eb7a6845"});
+ tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "dc0dde72131d562587acae967cf2031f"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"});
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
index dfbbd7084..ba296f263 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@@ -84,22 +84,22 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
- HCTest(CEUTRIO_BAM, "", "c0b1b64c6005cd3640ffde5dbc10174b");
+ HCTest(CEUTRIO_BAM, "", "f2ad35b5e0d181fb18da86a8971ce4f4");
}
@Test
public void testHaplotypeCallerSingleSample() {
- HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295");
+ HCTest(NA12878_BAM, "", "06abde3268336a7cdb21970f12e819ba");
}
@Test
public void testHaplotypeCallerGraphBasedSingleSample() {
- HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "213df0bdaa78a695e9336128333e4407");
+ HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "3d1cb9acdf66547f88ad1742e8178044");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSample() {
- HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ceee711cac50b4bb66a084acb9264941");
+ HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "af6f1f504ad771201aedc0157de8830a");
}
@Test(enabled = false) // can't annotate the rsID's yet
@@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
- "b09437f11db40abd49195110e50692c2");
+ "fd43de437bbaf960499f67daedc6ef63");
}
@Test
@@ -126,7 +126,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
- HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c57c463542304fb7b2576e531faca89e");
+ HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3a3bb5f0bcec603287520841c559638f");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@@ -173,7 +173,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
- final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a"));
+ final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("170896ddcfe06ec47e08aefefd99cf78"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
- Arrays.asList("a43d6226a51eb525f0774f88e3778189"));
+ Arrays.asList("6ab05a77d2e79d21ba85fadf844a13ba"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@@ -261,7 +261,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
- Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f"));
+ Arrays.asList("903af86b396ce88a6c8e4f4016fbe769"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@@ -293,7 +293,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
- Arrays.asList("19c2992541ede7407192660fdc1fadbf"));
+ Arrays.asList("824188743703bc09225c5b9c6b404ac1"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@@ -301,7 +301,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
- Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16"));
+ Arrays.asList("14de866430f49c0026aafc1e34ed8250"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
}
From 8fcad6680b6907e4a5ac00ea60216b633fd27e7f Mon Sep 17 00:00:00 2001
From: Geraldine Van der Auwera
Date: Tue, 10 Dec 2013 13:19:53 -0500
Subject: [PATCH 015/113] Assorted fixes and improvements to gatkdocs
-Added docs for ERC mode in HC
-Move RecalibrationPerformance walker since to private since it is experimental and unsupported
-Updated VR docs and restored percentBad/numBad (but @Hidden) to enable deprecation alert if users try to use them
-Improved error msg for conflict between per-interval aggregation and -nt
-Minor clean up in exception docs
-Added Toy Walkers category for devs and dev supercat (to build out docs for developers)
-Added more detailed info to GenotypeConcordance doc based on Chris forum post
-Added system to include min/max argument values in gatkdocs (build gatkdocs with 'ant gatkdocs' to test it, see engine and DoC args for in situ examples)
-Added tentative min/max argument annotations to DepthOfCoverage and CommandLineGATK arguments (and improved docs while at it)
-Added gotoDev annotation to GATKDocumentedFeature to track who is the go-to person in GSA for questions & issues about specific walkers/tools (now discreetly indicated in each gatkdoc)
---
.../bqsr/RecalibrationPerformance.java | 141 ---------
.../haplotypecaller/HaplotypeCaller.java | 8 +-
.../VariantRecalibrator.java | 8 +-
...VariantRecalibratorArgumentCollection.java | 16 +
.../sting/commandline/CommandLineProgram.java | 31 +-
.../IntervalArgumentCollection.java | 49 ++-
.../sting/gatk/CommandLineGATK.java | 26 +-
.../arguments/GATKArgumentCollection.java | 295 +++++++++++-------
.../sting/gatk/executive/MicroScheduler.java | 2 +-
.../walkers/coverage/DepthOfCoverage.java | 89 +++---
.../sting/gatk/walkers/qc/ErrorThrowing.java | 5 +-
.../variantutils/GenotypeConcordance.java | 77 +++--
.../DynamicClassResolutionException.java | 4 -
.../sting/utils/exceptions/UserException.java | 4 -
.../utils/help/DocumentedGATKFeature.java | 4 +-
.../help/DocumentedGATKFeatureObject.java | 10 +-
.../sting/utils/help/GATKDoclet.java | 8 +-
.../help/GenericDocumentationHandler.java | 38 ++-
.../sting/utils/help/HelpConstants.java | 23 +-
settings/helpTemplates/common.html | 8 +-
.../helpTemplates/generic.index.template.html | 3 +-
settings/helpTemplates/generic.template.html | 103 +++---
22 files changed, 534 insertions(+), 418 deletions(-)
delete mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
deleted file mode 100644
index 271617059..000000000
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
-*/
-
-package org.broadinstitute.sting.gatk.walkers.bqsr;
-
-import org.broadinstitute.sting.commandline.*;
-import org.broadinstitute.sting.gatk.CommandLineGATK;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.filters.*;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.report.GATKReport;
-import org.broadinstitute.sting.gatk.report.GATKReportTable;
-import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.help.HelpConstants;
-import org.broadinstitute.sting.utils.recalibration.*;
-
-import java.io.*;
-
-/**
- * Evaluate the performance of the base recalibration process
- *
- *
This tool aims to evaluate the results of the Base Quality Score Recalibration (BQSR) process.
- *
- *
Caveat
- *
This tool is currently experimental. We do not provide documentation nor support for its operation.
- *
- */
-@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
-@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
-@PartitionBy(PartitionType.READ)
-public class RecalibrationPerformance extends RodWalker implements NanoSchedulable {
-
- @Output
- public PrintStream out;
-
- @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file")
- public File RECAL_FILE = null;
-
- public void initialize() {
- out.println("Cycle\tQrep\tQemp\tIsJoint\tObservations\tErrors");
-
- final GATKReport report = new GATKReport(RECAL_FILE);
- final GATKReportTable table = report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
- for ( int row = 0; row < table.getNumRows(); row++ ) {
-
- final int nObservations = (int)asDouble(table.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
- final int nErrors = (int)Math.round(asDouble(table.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)));
- final double empiricalQuality = asDouble(table.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));
-
- final byte QReported = Byte.parseByte((String) table.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME));
-
- final double jointEstimateQemp = RecalDatum.bayesianEstimateOfEmpiricalQuality(nObservations, nErrors, QReported);
-
- //if ( Math.abs((int)(jointEstimateQemp - empiricalQuality)) > 1 )
- // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f, point Qemp = %f, joint Qemp = %f", estimatedQReported, nObservations, nErrors, empiricalQuality, jointEstimateQemp));
-
- if ( table.get(row, RecalUtils.COVARIATE_NAME_COLUMN_NAME).equals("Cycle") &&
- table.get(row, RecalUtils.EVENT_TYPE_COLUMN_NAME).equals("M") &&
- table.get(row, RecalUtils.READGROUP_COLUMN_NAME).equals("20FUKAAXX100202.6") &&
- (QReported == 6 || QReported == 10 || QReported == 20 || QReported == 30 || QReported == 45) ) {
- out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, Math.round(empiricalQuality), "False", (int)nObservations, (int)nErrors));
- out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, (int)jointEstimateQemp, "True", (int)nObservations, (int)nErrors));
- }
- }
-
- }
-
- @Override
- public boolean isDone() {
- return true;
- }
-
- private double asDouble(final Object o) {
- if ( o instanceof Double )
- return (Double)o;
- else if ( o instanceof Integer )
- return (Integer)o;
- else if ( o instanceof Long )
- return (Long)o;
- else
- throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass());
- }
-
- @Override
- public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }
-
- @Override
- public Integer reduceInit() { return 0; }
-
- @Override
- public Integer reduce(Integer counter, Integer sum) { return 0; }
-
- @Override
- public void onTraversalDone(Integer sum) {}
-}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 82015d153..0bedf9062 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -280,8 +280,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// general advanced arguments to control haplotype caller behavior
// -----------------------------------------------------------------------------------------------
+ /**
+ * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference.
+ * See http://www.broadinstitute.org/gatk/guide/article?id=2940 for more details of how this works.
+ * Note that if you set -ERC GVCF, you also need to set -variant_index_type LINEAR and -variant_index_parameter 128000 (with those exact values!).
+ * This requirement is a temporary workaround for an issue with index compression.
+ */
@Advanced
- @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Emit experimental reference confidence scores", required = false)
+ @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Mode for emitting experimental reference confidence scores", required = false)
protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;
public enum ReferenceConfidenceMode {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
index d43dc4a12..c5e2b8183 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@@ -165,10 +165,10 @@ public class VariantRecalibrator extends RodWalker> resource = Collections.emptyList();
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
index b501655f8..81067e695 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
@@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
@@ -117,4 +118,19 @@ public class VariantRecalibratorArgumentCollection {
@Advanced
@Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="The LOD score below which to be used when building the Gaussian mixture model of bad variants.", required=false)
public double BAD_LOD_CUTOFF = -5.0;
+
+ /////////////////////////////
+ // Deprecated Arguments
+ // Keeping them here is meant to provide users with error messages that are more informative than "arg not defined" when they use an argument that has been put out of service
+ /////////////////////////////
+
+ @Hidden
+ @Deprecated
+ @Argument(fullName="percentBadVariants", shortName="percentBad", doc="This argument is no longer used in GATK versions 2.7 and newer. Please see the online documentation for the latest usage recommendations.", required=false)
+ public double PERCENT_BAD_VARIANTS = 0.03;
+
+ @Hidden
+ @Deprecated
+ @Argument(fullName="numBadVariants", shortName="numBad", doc="This argument is no longer used in GATK versions 2.8 and newer. Please see the online documentation for the latest usage recommendations.", required=false)
+ public int NUM_BAD_VARIANTS = 1000;
}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
index f00bd0ad6..8c7e11f35 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
@@ -43,26 +43,29 @@ public abstract class CommandLineProgram {
/** The command-line program and the arguments it returned. */
public ParsingEngine parser = null;
- /** the default log level */
- @Argument(fullName = "logging_level",
- shortName = "l",
- doc = "Set the minimum level of logging, i.e. setting INFO get's you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging.",
- required = false)
+ /**
+ * Setting INFO gets you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging, and so on.
+ */
+ @Argument(fullName = "logging_level", shortName = "l", doc = "Set the minimum level of logging", required = false)
protected String logging_level = "INFO";
-
- /** where to send the output of our logger */
- @Output(fullName = "log_to_file",
- shortName = "log",
- doc = "Set the logging location",
- required = false)
+ /**
+ * File to save the logging output.
+ */
+ @Output(fullName = "log_to_file", shortName = "log", doc = "Set the logging location", required = false)
protected String toFile = null;
- /** this is used to indicate if they've asked for help */
- @Argument(fullName = "help", shortName = "h", doc = "Generate this help message", required = false)
+ /**
+ * This will produce a help message in the terminal with general usage information, listing available arguments
+ * as well as tool-specific information if applicable.
+ */
+ @Argument(fullName = "help", shortName = "h", doc = "Generate the help message", required = false)
public Boolean help = false;
- /** This is used to indicate if they've asked for the version information */
+ /**
+ * Use this to check the version number of the GATK executable you are invoking. Note that the version number is
+ * always included in the output at the start of every run as well as any error message.
+ */
@Argument(fullName = "version", shortName = "version", doc ="Output version information", required = false)
public Boolean version = false;
diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java
index b491c9f3d..d2a1735fb 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java
@@ -33,38 +33,53 @@ import java.util.List;
public class IntervalArgumentCollection {
/**
- * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
- * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
- * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
- * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
+ * Use this option to perform the analysis over only part of the genome. This argument can be specified multiple times.
+ * You can use samtools-style intervals either explicitly on the command line (e.g. -L chr1 or -L chr1:100-200) or
+ * by loading in a file containing a list of intervals (e.g. -L myFile.intervals).
+ *
+ * Additionally, you can also specify a ROD file (such as a VCF file) in order to perform the analysis at specific
+ * positions based on the records present in the file (e.g. -L file.vcf).
+ *
+ * Finally, you can also use this to perform the analysis on the reads that are completely unmapped in the BAM file
+ * (i.e. those without a reference contig) by specifying -L unmapped.
*/
- @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
+ @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate", required = false)
public List> intervals = null;
/**
- * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
- * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
- * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
- */
- @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
+ * Use this option to exclude certain parts of the genome from the analysis (like -L, but the opposite).
+ * This argument can be specified multiple times. You can use samtools-style intervals either explicitly on the
+ * command line (e.g. -XL chr1 or -XL chr1:100-200) or by loading in a file containing a list of intervals
+ * (e.g. -XL myFile.intervals).
+ *
+ * Additionally, you can also specify a ROD file (such as a VCF file) in order to exclude specific
+ * positions from the analysis based on the records present in the file (e.g. -L file.vcf).
+ */
+ @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing", required = false)
public List> excludeIntervals = null;
/**
- * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
- * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
+ * By default, the program will take the UNION of all intervals specified using -L and/or -XL. However, you can
+ * change this setting, for example if you want to take the INTERSECTION of the sets instead. E.g. to perform the
+ * analysis on positions for which there is a record in a VCF, but restrict this to just those on chromosome 20,
+ * you would do -L chr20 -L file.vcf -isr INTERSECTION.
*/
- @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
+ @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Set merging approach to use for combining interval inputs", required = false)
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
/**
- * Should abutting (but not overlapping) intervals be treated as separate intervals?
+ * By default, the program merges abutting intervals (i.e. intervals that are directly side-by-side but do not
+ * actually overlap) into a single continuous interval. However you can change this behavior if you want them to be
+ * treated as separate intervals instead.
*/
- @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
+ @Argument(fullName = "interval_merging", shortName = "im", doc = "Interval merging rule for abutting intervals", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
/**
- * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
+ * Use this to add padding to the intervals specified using -L and/or -XL. For example, '-L chr1:100' with a
+ * padding value of 20 would turn into '-L chr1:80-120'. This is typically used to add padding around exons when
+ * analyzing exomes. The general Broad exome calling pipeline uses 100 bp padding by default.
*/
- @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
+ @Argument(fullName = "interval_padding", shortName = "ip", doc = "Amount of padding (in bp) to add to each interval", required = false, minValue = 0)
public int intervalPadding = 0;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index 5fc0ccd3e..728fee5c8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -44,15 +44,31 @@ import java.util.*;
/**
* All command line parameters accepted by all tools in the GATK.
*
- * The GATK engine itself. Manages map/reduce data access and runs walkers.
+ *
Info for general users
*
- * We run command line GATK programs using this class. It gets the command line args, parses them, and hands the
- * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here,
- * the gatk engine should deal with any data related information.
+ *
This is a list of options and parameters that are generally available to all tools in the GATK.
+ *
+ *
There may be a few restrictions, which are indicated in individual argument descriptions. For example the -BQSR
+ * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used
+ * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This
+ * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just
+ * skimming the one-line summary in the table.
+ *
+ *
Info for developers
+ *
+ *
This class is the GATK engine itself, which manages map/reduce data access and runs walkers.
+ *
+ *
We run command line GATK programs using this class. It gets the command line args, parses them, and hands the
+ * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here;
+ * the GATK engine should deal with any data related information.
*/
@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE)
public class CommandLineGATK extends CommandLineExecutable {
- @Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run")
+ /**
+ * A complete list of tools (sometimes also called walkers because they "walk" through the data to perform analyses)
+ * is available in the online documentation.
+ */
+ @Argument(fullName = "analysis_type", shortName = "T", doc = "Name of the tool to run")
private String analysisName = null;
// our argument collection, the collection of command line args we accept
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 08f892f97..88b34090c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -50,19 +50,20 @@ import java.util.concurrent.TimeUnit;
*/
public class GATKArgumentCollection {
- /* our version number */
- private float versionNumber = 1;
- private String description = "GATK Arguments";
-
/** the constructor */
public GATKArgumentCollection() {
}
// parameters and their defaults
- @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
+ /**
+ * An input file containing sequence data mapped to a reference, in SAM or BAM format, or a text file containing a
+ * list of input files (with extension .list). Note that the GATK requires an accompanying index for each SAM or
+ * BAM file. Please see our online documentation for more details on input formatting requirements.
+ */
+ @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false)
public List samFiles = new ArrayList();
- @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
+ @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0)
public Integer readBufferSize = null;
// --------------------------------------------------------------------------------------------------------------
@@ -71,21 +72,30 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
- @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? AWS is the default, can be NO_ET so nothing is posted to the run repository. Please see " + UserException.PHONE_HOME_DOCS_URL + " for details.", required = false)
+ /**
+ * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic
+ * non-identifying statistics (which tool was used, whether the run was successful etc.) that help us for debugging
+ * and development. You can use this option to turn off reporting if your run environment is not connected to the
+ * internet or if your data is subject to stringent confidentiality clauses (e.g. clinical patient data).
+ * To do so you will need to request a key using the online request form on our website.
+ */
+ @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS;
-
- @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + UserException.PHONE_HOME_DOCS_URL + " for details.", required = false)
+ /**
+ * Please see the online documentation FAQs for more details on the key system and how to request a key.
+ */
+ @Argument(fullName = "gatk_key", shortName = "K", doc="GATK key file required to run with -et NO_ET", required = false)
public File gatkKeyFile = null;
/**
- * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be
+ * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be
* used to group together runs during later analysis. One use of this capability is to tag runs as GATK
* performance tests, so that the performance of the GATK over time can be assessed from the logs directly.
*
* Note that the tags do not conform to any ontology, so you are free to use any tags that you might find
* meaningful.
*/
- @Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false)
+ @Argument(fullName = "tag", shortName = "tag", doc="Tag to identify this GATK run as part of a group of runs", required = false)
public String tag = "NA";
// --------------------------------------------------------------------------------------------------------------
@@ -94,26 +104,48 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
- @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
- public List readFilters = new ArrayList();
+ /**
+ * Reads that fail the specified filters will not be used in the analysis. Multiple filters can be specified separately,
+ * e.g. you can do -rf MalformedRead -rf BadCigar and so on. Available read filters are listed in the online tool
+ * documentation. Note that the filter name format is e.g. MalformedReadFilter, but at the command line the filter
+ * name should be given without the Filter suffix; e.g. -rf MalformedRead (NOT -rf MalformedReadFilter, which is not
+ * recognized by the program). Note also that some read filters are applied by default for some analysis tools; this
+ * is specified in each tool's documentation. The default filters cannot be disabled.
+ */
+ @Argument(fullName = "read_filter", shortName = "rf", doc = "Filters to apply to reads before analysis", required = false)
+ public final List readFilters = new ArrayList();
@ArgumentCollection
public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection();
-
+ /**
+ * The reference genome against which the sequence data was mapped. The GATK requires an index file and a dictionary
+ * file accompanying the reference (please see the online documentation FAQs for more details on these files). Although
+ * this argument is indicated as being optional, almost all GATK tools require a reference in order to run.
+ * Note also that while GATK can in theory process genomes from any organism with any number of chromosomes or contigs,
+ * it is not designed to process draft genome assemblies and performance will decrease as the number of contigs in
+ * the reference increases. We strongly discourage the use of unfinished genome assemblies containing more than a few
+ * hundred contigs. Contig numbers in the thousands will most probably cause memory-related crashes.
+ */
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
-
- @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
+ /**
+ * If this flag is enabled, the random numbers generated will be different in every run, causing GATK to behave non-deterministically.
+ */
+ @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Use a non-deterministic random seed", required = false)
public boolean nonDeterministicRandomSeed = false;
-
+ /**
+ * To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.
+ */
@Hidden
- @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.")
+ @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests.")
public boolean disableDithering = false;
-
- @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false)
+ /**
+ * This will truncate the run but without exiting with a failure. By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument.
+ */
+ @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false, minValue = 0)
public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;
- @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="The TimeUnit for maxRuntime", required = false)
+ @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false)
public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES;
// --------------------------------------------------------------------------------------------------------------
@@ -122,32 +154,47 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
/**
- * Reads will be selected randomly to be removed from the pile based on the method described here.
+ * There are several ways to downsample reads, i.e. to remove reads from the pile of reads that will be used for analysis.
+ * See the documentation of the individual downsampling options for details on how they work. Note that many GATK tools
+ * specify a default downsampling type and target, but this behavior can be overridden from command line using the
+ * downsampling arguments.
*/
- @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus", required = false)
+ @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false)
public DownsampleType downsamplingType = null;
-
- @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
+ /**
+ * Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of
+ * the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling
+ * is truly unbiased and random. It is typically used to simulate the effect of generating different amounts of
+ * sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target
+ * coverage you need to aim for in order to obtain enough coverage in all loci of interest.
+ */
+ @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction of reads to downsample to", required = false, minValue = 0.0, maxValue = 1.0)
public Double downsampleFraction = null;
/**
- * For locus-based traversals (LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the
- * maximum depth of coverage at each locus. For read-based traversals (ReadWalkers), it controls the
- * maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use
- * much lower dcov values than you would with LocusWalkers to see an effect. Note that this downsampling option does
- * not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the
- * to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when
- * removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. Also note
- * that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling
- * algorithm will under some circumstances retain slightly more or less coverage than requested.
+ * The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to
+ * get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes
+ * unreasonable computational costs. The downsampling process takes two different forms depending on the type of
+ * analysis it is used with.
+ *
+ * For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller),
+ * downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals
+ * (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start
+ * position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers
+ * to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available
+ * reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation
+ * of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of
+ * reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be
+ * met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than
+ * requested.
*/
@Argument(fullName = "downsample_to_coverage", shortName = "dcov",
- doc = "Coverage [integer] to downsample to per locus (for locus walkers) or per alignment start position (for read walkers)",
- required = false)
+ doc = "Target coverage threshold for downsampling to coverage",
+ required = false, minValue = 0)
public Integer downsampleCoverage = null;
/**
- * Gets the downsampling method explicitly specified by the user. If the user didn't specify
+ * Gets the downsampling method explicitly specified by the user. If the user didn't specify
* a default downsampling mechanism, return the default.
* @return The explicitly specified downsampling mechanism, or the default if none exists.
*/
@@ -178,8 +225,10 @@ public class GATKArgumentCollection {
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false)
public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF;
-
- @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
+ /**
+ * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets.
+ */
+ @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0)
public double BAQGOP = BAQ.DEFAULT_GOP;
// --------------------------------------------------------------------------------------------------------------
@@ -189,19 +238,33 @@ public class GATKArgumentCollection {
// --------------------------------------------------------------------------------------------------------------
/**
- * Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at Q64. The idea here is
- * simple: we just iterate over all reads and subtract 31 from every quality score.
+ * By default the GATK assumes that base quality scores start at Q0 == ASCII 33 according to the SAM specification.
+ * However, encoding in some datasets (especially older Illumina ones) starts at Q64. This argument will fix the
+ * encodings on the fly (as the data is read in) by subtracting 31 from every quality score. Note that this argument should
+ * NEVER be used by default; you should only use it when you have confirmed that the quality scores in your data are
+ * not in the correct encoding.
*/
@Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false)
public boolean FIX_MISENCODED_QUALS = false;
-
- @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Do not fail when encountering base qualities that are too high and that seemingly indicate a problem with the base quality encoding of the BAM file", required = false)
+ /**
+ * This flag tells GATK to ignore warnings when encountering base qualities that are too high and that seemingly
+ * indicate a problem with the base quality encoding of the BAM file. You should only use this if you really know
+ * what you are doing; otherwise you could seriously mess up your data and ruin your analysis.
+ */
+ @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Ignore warnings about base quality score encoding", required = false)
public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false;
-
- @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
+ /**
+ * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
+ * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
+ * tag is present for a read, the standard qual score will be used.
+ */
+ @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "Use the base quality scores from the OQ tag", required=false)
public Boolean useOriginalBaseQualities = false;
-
- @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
+ /**
+ * If reads are missing some or all base quality scores, this value will be used for all base quality scores.
+ * By default this is set to -1 to disable default base quality assignment.
+ */
+ @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "Assign a default base quality", required=false, minValue = 0, maxValue = Byte.MAX_VALUE)
public byte defaultBaseQualities = -1;
// --------------------------------------------------------------------------------------------------------------
@@ -213,9 +276,9 @@ public class GATKArgumentCollection {
/**
* The file name for the GATK performance log output, or null if you don't want to generate the
* detailed performance logging table. This table is suitable for importing into R or any
- * other analysis software that can read tsv files
+ * other analysis software that can read tsv files.
*/
- @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
+ @Argument(fullName = "performanceLog", shortName="PF", doc="Write GATK runtime performance log to this file", required = false)
public File performanceLog = null;
// --------------------------------------------------------------------------------------------------------------
@@ -225,10 +288,11 @@ public class GATKArgumentCollection {
// --------------------------------------------------------------------------------------------------------------
/**
- * Enables on-the-fly recalibrate of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool.
- * Please be aware that one should only run recalibration with the covariates file created on the same input bam(s).
+ * Enables on-the-fly recalibration of base qualities, intended primarily for use with BaseRecalibrator and PrintReads
+ * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool.
+ * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s).
*/
- @Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration (intended for use with BaseRecalibrator and PrintReads)")
+ @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Input covariates table file for on-the-fly base quality score recalibration")
public File BQSR_RECAL_FILE = null;
/**
@@ -243,36 +307,41 @@ public class GATKArgumentCollection {
public int quantizationLevels = 0;
/**
- * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument and only the base substitution qualities will be produced.
+ * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced.
*/
- @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "If true, disables printing of base insertion and base deletion tags (with -BQSR)", required=false)
+ @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "Disable printing of base insertion and deletion tags (with -BQSR)", required=false)
public boolean disableIndelQuals = false;
/**
- * By default, the OQ tag in not emitted when using the -BQSR argument.
+ * By default, the OQ tag is not emitted when using the -BQSR argument. Use this flag to include OQ tags in the output BAM file.
+ * Note that this may result in significant file size increase.
*/
- @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "If true, enables printing of the OQ tag with the original base qualities (with -BQSR)", required=false)
+ @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "Emit the OQ tag with the original base qualities (with -BQSR)", required=false)
public boolean emitOriginalQuals = false;
/**
- * Do not modify quality scores less than this value but rather just write them out unmodified in the recalibrated BAM file.
+ * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in the recalibrated BAM file.
* In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases.
* For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself,
* but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect,
* your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream.
*/
- @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Bases with quality scores less than this threshold won't be recalibrated (with -BQSR)", required = false)
+ @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE)
public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE;
-
- @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "The global Qscore Bayesian prior to use in the BQSR. If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score", required = false)
+ /**
+ * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score.
+ */
+ @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false)
public double globalQScorePrior = -1.0;
/**
- * For the sake of your data, please only use this option if you know what you are doing. It is absolutely not recommended practice
- * to run base quality score recalibration on reduced BAM files.
+ * It is absolutely not recommended practice to run base quality score recalibration on BAM files that have been
+ * processed with ReduceReads. By default, the GATK will error out if it detects that you are trying to recalibrate
+ * a reduced BAM file. However, this flag allows you to disable the warning and proceed anyway. For the sake of your
+ * data, please only use this option if you really know what you are doing.
*/
@Advanced
- @Argument(fullName = "allow_bqsr_on_reduced_bams_despite_repeated_warnings", shortName="allowBqsrOnReducedBams", doc="Do not fail when running base quality score recalibration on a reduced BAM file even though we highly recommend against it", required = false)
+ @Argument(fullName = "allow_bqsr_on_reduced_bams_despite_repeated_warnings", shortName="allowBqsrOnReducedBams", doc="Ignore all warnings about how it's a really bad idea to run BQSR on a reduced BAM file (AT YOUR OWN RISK!)", required = false)
public boolean ALLOW_BQSR_ON_REDUCED_BAMS = false;
// --------------------------------------------------------------------------------------------------------------
@@ -281,35 +350,45 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
+ /**
+ * Keep in mind that if you set this to LENIENT, we may refuse to provide you with support if anything goes wrong.
+ */
@Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false)
public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT;
-
- @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we override the Walker's default and remove program records from the SAM header", required = false)
+ /**
+ * Some tools keep program records in the SAM header by default. Use this argument to override that behavior and discard program records from the SAM header.
+ */
+ @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Remove program records from the SAM header", required = false)
public boolean removeProgramRecords = false;
-
- @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false)
+ /**
+ * Some tools discard program records from the SAM header by default. Use this argument to override that behavior and keep program records in the SAM header.
+ */
+ @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Keep program records in the SAM header", required = false)
public boolean keepProgramRecords = false;
-
+ /**
+ * This option requires that each BAM file listed in the mapping file have only a single sample specified in its header
+ * (though there may be multiple read groups for that sample). Each line of the mapping file must contain the absolute
+ * path to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.
+ */
@Advanced
- @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file",
- doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file. This option requires that " +
- "each BAM file listed in the mapping file have only a single sample specified in its header (though there " +
- "may be multiple read groups for that sample). Each line of the mapping file must contain the absolute path " +
- "to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.",
- required = false)
+ @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false)
public File sampleRenameMappingFile = null;
-
- @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
+ /**
+ * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong.
+ */
+ @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false)
public ValidationExclusion.TYPE unsafe;
-
+ /**
+ * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking
+ * when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index
+ * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it
+ * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general
+ * because it allows reading from index files without first acquiring a lock.
+ */
@Hidden
@Advanced
@Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods",
- doc = "UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking " +
- "when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index " +
- "generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it " +
- "for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general " +
- "because it allows reading from index files without first acquiring a lock.",
+ doc = "Disable both auto-generation of index files and index file locking",
required = false)
public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false;
@@ -320,23 +399,22 @@ public class GATKArgumentCollection {
// --------------------------------------------------------------------------------------------------------------
/**
- * How many data threads should be allocated to this analysis? Data threads contains N cpu threads per
- * data thread, and act as completely data parallel processing, increasing the memory usage of GATK
- * by M data threads. Data threads generally scale extremely effectively, up to 24 cores
+ * Data threads contain N cpu threads per data thread, and act as completely data parallel processing, increasing
+ * the memory usage of GATK by M data threads. Data threads generally scale extremely effectively, up to 24 cores.
+ * See online documentation FAQs for more information.
*/
- @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false)
+ @Argument(fullName = "num_threads", shortName = "nt", doc = "Number of data threads to allocate to this analysis", required = false, minValue = 1)
public Integer numberOfDataThreads = 1;
/**
- * How many CPU threads should be allocated per data thread? Each CPU thread operates the map
- * cycle independently, but may run into earlier scaling problems with IO than data threads. Has
- * the benefit of not requiring X times as much memory per thread as data threads do, but rather
- * only a constant overhead.
+ * Each CPU thread operates the map cycle independently, but may run into earlier scaling problems with IO than
+ * data threads. Has the benefit of not requiring X times as much memory per thread as data threads do, but rather
+ * only a constant overhead. See online documentation FAQs for more information.
*/
- @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false)
+ @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="Number of CPU threads to allocate per data thread", required = false, minValue = 1)
public int numberOfCPUThreadsPerDataThread = 1;
- @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
+ @Argument(fullName="num_io_threads", shortName = "nit", doc="Number of given threads to allocate to IO", required = false, minValue = 0)
@Hidden
public int numberOfIOThreads = 0;
@@ -345,13 +423,15 @@ public class GATKArgumentCollection {
* cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for
* debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct.
*/
- @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable GATK threading efficiency monitoring", required = false)
+ @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable threading efficiency monitoring", required = false)
public Boolean monitorThreadEfficiency = false;
- @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
+ @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="Total number of BAM file handles to keep open simultaneously", required=false, minValue = 1)
public Integer numberOfBAMFileHandles = null;
-
- @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false)
+ /**
+ * This will filter out read groups matching <TAG>:<STRING> (e.g. SM:sample1) or a .txt file containing the filter strings one per line.
+ */
+ @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Exclude read groups based on tags", required = false)
public List readGroupBlackList = null;
// --------------------------------------------------------------------------------------------------------------
@@ -433,7 +513,7 @@ public class GATKArgumentCollection {
/**
* How strict should we be in parsing the PED files?
*/
- @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
+ @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="Validation strictness for pedigree information",required=false)
public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;
// --------------------------------------------------------------------------------------------------------------
@@ -441,8 +521,10 @@ public class GATKArgumentCollection {
// BAM indexing and sharding arguments
//
// --------------------------------------------------------------------------------------------------------------
-
- @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false)
+ /**
+ * NO INTEGRATION TESTS are available. Use at your own risk.
+ */
+ @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM",required=false)
@Hidden
public boolean allowIntervalsWithUnindexedBAM = false;
@@ -451,8 +533,10 @@ public class GATKArgumentCollection {
// testing BCF2
//
// --------------------------------------------------------------------------------------------------------------
-
- @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes",required=false)
+ /**
+ * If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes.
+ */
+ @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="Write a BCF copy of the output VCF",required=false)
@Hidden
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@@ -471,12 +555,13 @@ public class GATKArgumentCollection {
* DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
* DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
*/
-
- @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="which type of IndexCreator to use for VCF/BCF indices",required=false)
+ @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false)
@Advanced
public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE;
-
- @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator",required=false)
+ /**
+ * This is either the bin width or the number of features per bin, depending on the indexing strategy.
+ */
+ @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false)
@Advanced
public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
index 7077db49c..405c07392 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
@@ -147,7 +147,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
if ( threadAllocation.getNumDataThreads() > 1 ) {
if (walker.isReduceByInterval())
- throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
+ throw new UserException.BadArgumentValue("nt", String.format("This run of %s is set up to aggregate results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option or check if this tool has an option to disable per-interval calculations.", engine.getWalkerName(walker.getClass())));
if ( ! (walker instanceof TreeReducible) ) {
throw badNT("nt", engine, walker);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
index ca3255097..3a51a9a6a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
@@ -57,10 +57,10 @@ import java.io.PrintStream;
import java.util.*;
/**
- * Toolbox for assessing sequence coverage by a wide array of metrics, partitioned by sample, read group, or library
+ * Assess sequence coverage by a wide array of metrics, partitioned by sample, read group, or library
*
*
- * Coverage processes a set of bam files to determine coverage at different levels of partitioning and
+ * This tool processes a set of bam files to determine coverage at different levels of partitioning and
* aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by
* sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles,
* and/or percentage of bases covered to or beyond a threshold.
@@ -73,7 +73,7 @@ import java.util.*;
*
*(Optional) A REFSEQ Rod to aggregate coverage to the gene level
*
- * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
+ * (for information about creating the REFSEQ Rod, please consult the online documentation)
*
*
Output
*
@@ -117,7 +117,7 @@ import java.util.*;
// todo -- alter logarithmic scaling to spread out bins more
// todo -- allow for user to set linear binning (default is logarithmic)
// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
-@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
+@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.MC)
@By(DataSource.REFERENCE)
@PartitionBy(PartitionType.NONE)
@Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE)
@@ -125,53 +125,63 @@ public class DepthOfCoverage extends LocusWalker
+ *
+ *
Moltenized tables
+ *
+ *
These tables may be optionally moltenized via the -moltenize argument. That is, the standard table
*
*
*
- * These tables are constructed on a per-sample basis, and include counts of eval vs comp genotype states, and the
- * number of times the alternate alleles between the eval and comp sample did not match up.
- *
- * In addition, Genotype Concordance produces site-level allelic concordance. For strictly bi-allelic VCFs,
- * only the ALLELES_MATCH, EVAL_ONLY, TRUTH_ONLY fields will be populated, but where multi-allelic sites are involved
- * counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated.
- *
+ *
+ * For strictly bi-allelic VCFs, only the ALLELES_MATCH, EVAL_ONLY, TRUTH_ONLY fields will be populated,
+ * but where multi-allelic sites are involved counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated.
+ *
+ *
* For example, in the following situation
*
* eval: ref - A alt - C
* comp: ref - A alt - C,T
*
* then the site is tabulated as EVAL_SUBSET_TRUTH. Were the situation reversed, it would be EVAL_SUPERSET_TRUTH.
- * However, in the case where eval has both C and T alternate alleles, both must be observed in the genotypes
+ * However, in the case where EVAL has both C and T alternate alleles, both must be observed in the genotypes
* (that is, there must be at least one of (0/1,1/1) and at least one of (0/2,1/2,2/2) in the genotype field). If
- * one of the alleles has no observations in the genotype fields of the eval, the site-level concordance is
+ * one of the alleles has no observations in the genotype fields of the EVAL, the site-level concordance is
* tabulated as though that allele were not present in the record.
+ *
*
- *
Monomorphic Records
+ *
Monomorphic Records
+ *
* A site which has an alternate allele, but which is monomorphic in samples, is treated as not having been
- * discovered, and will be recorded in the TRUTH_ONLY column (if a record exists in the comp VCF), or not at all
- * (if no record exists in the comp VCF).
- *
+ * discovered, and will be recorded in the TRUTH_ONLY column (if a record exists in the COMP set), or not at all
+ * (if no record exists in the COMP set).
+ *
+ *
* That is, in the situation
*
* eval: ref - A alt - C genotypes - 0/0 0/0 0/0 ... 0/0
@@ -121,14 +150,18 @@ import java.util.*;
* eval: ref - A alt - . genotypes - 0/0 0/0 0/0 ... 0/0
* comp: ref - A alt - C ... 0/0 0/0 ...
*
- *
- * When a record is present in the comp VCF the *genotypes* for the monomorphic site will still be used to evaluate
+ *
+ *
+ * When a record is present in the COMP set the *genotypes* for the monomorphic site will still be used to evaluate
* per-sample genotype concordance counts.
+ *
*
- *
Filtered Records
+ *
Filtered Records
* Filtered records are treated as though they were not present in the VCF, unless -ignoreSiteFilters is provided,
* in which case all records are used. There is currently no way to assess concordance metrics on filtered sites
* exclusively. SelectVariants can be used to extract filtered sites, and VariantFiltration used to un-filter them.
+ *
+
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> {
diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java
index 4d280423e..0f1b473c3 100644
--- a/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java
+++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/DynamicClassResolutionException.java
@@ -29,10 +29,6 @@ import java.lang.reflect.InvocationTargetException;
/**
* Class for handling common failures of dynamic class resolution
- *
- * User: depristo
- * Date: Sep 3, 2010
- * Time: 2:24:09 PM
*/
public class DynamicClassResolutionException extends UserException {
public DynamicClassResolutionException(Class c, Exception ex) {
diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
index 40a730029..4db6e3d69 100644
--- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
+++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@@ -42,10 +42,6 @@ import java.io.File;
* Represents the common user errors detected by Sting / GATK
*
* Root class for all GATK user errors, as well as the container for errors themselves
- *
- * User: depristo
- * Date: Sep 3, 2010
- * Time: 2:24:09 PM
*/
@DocumentedGATKFeature(
groupName = HelpConstants.DOCS_CAT_USRERR,
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java
index 0390e32d7..0afcdae02 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java
@@ -37,7 +37,7 @@ import java.lang.annotation.*;
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface DocumentedGATKFeature {
- /** Should we actually document this feature, even through it's annotated? */
+ /** Should we actually document this feature, even though it's annotated? */
public boolean enable() default true;
/** The overall group name (walkers, readfilters) this feature is associated with */
public String groupName();
@@ -45,4 +45,6 @@ public @interface DocumentedGATKFeature {
public String summary() default "";
/** Are there links to other docs that we should include? CommandLineGATK.class for walkers, for example? */
public Class[] extraDocs() default {};
+ /** Who is the go-to developer for operation/documentation issues? */
+ public String gotoDev() default "NA";
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java
index 7d6819f39..ad0959bfe 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java
@@ -36,19 +36,20 @@ class DocumentedGATKFeatureObject {
private final Class classToDoc;
/** Are we enabled? */
private final boolean enable;
- private final String groupName, summary;
+ private final String groupName, summary, gotoDev;
private final Class[] extraDocs;
- public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs) {
+ public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs, final String gotoDev) {
this.classToDoc = classToDoc;
this.enable = enable;
this.groupName = groupName;
this.summary = summary;
this.extraDocs = extraDocs;
+ this.gotoDev = gotoDev;
}
- public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary) {
- this(classToDoc, true, groupName, summary, new Class[]{});
+ public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary, final String gotoDev) {
+ this(classToDoc, true, groupName, summary, new Class[]{}, gotoDev);
}
public Class getClassToDoc() { return classToDoc; }
@@ -56,4 +57,5 @@ class DocumentedGATKFeatureObject {
public String groupName() { return groupName; }
public String summary() { return summary; }
public Class[] extraDocs() { return extraDocs; }
+ public String gotoDev() { return gotoDev; }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java
index 63cb0900a..6468fe51d 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java
@@ -118,7 +118,8 @@ public class GATKDoclet {
static {
STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class,
HelpConstants.DOCS_CAT_RODCODECS,
- "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED"));
+ "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED",
+ "NA"));
}
@@ -332,11 +333,11 @@ public class GATKDoclet {
if (docClass.isAnnotationPresent(DocumentedGATKFeature.class)) {
DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class);
- return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs());
+ return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs(), f.gotoDev());
} else {
for (DocumentedGATKFeatureObject staticDocs : STATIC_DOCS) {
if (staticDocs.getClassToDoc().isAssignableFrom(docClass)) {
- return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs());
+ return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs(), staticDocs.gotoDev());
}
}
return null;
@@ -446,6 +447,7 @@ public class GATKDoclet {
if (annotation.groupName().endsWith(" Tools")) supercatValue = "tools";
else if (annotation.groupName().endsWith(" Utilities")) supercatValue = "utilities";
else if (annotation.groupName().startsWith("Engine ")) supercatValue = "engine";
+ else if (annotation.groupName().endsWith(" (DevZone)")) supercatValue = "dev";
else supercatValue = "other";
root.put("supercat", supercatValue);
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java
index 893a8349b..06c0e1c26 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java
@@ -123,6 +123,8 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
for (Tag tag : toProcess.classDoc.tags()) {
root.put(tag.name(), tag.text());
}
+
+ root.put("gotoDev", toProcess.annotation.gotoDev());
}
/**
@@ -160,17 +162,29 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
try {
// loop over all of the arguments according to the parsing engine
for (final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(DocletUtils.getClassForDoc(toProcess.classDoc))) {
- // todo -- why can you have multiple ones?
ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0);
FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName());
Map argBindings = docForArgument(fieldDoc, argumentSource, argDef);
if (!argumentSource.isHidden() || getDoclet().showHiddenFeatures()) {
final String kind = docKindOfArg(argumentSource);
-
+ // Retrieve default value
final Object value = argumentValue(toProcess.clazz, argumentSource);
if (value != null)
argBindings.put("defaultValue", prettyPrintValueString(value));
-
+ // Retrieve min and max / hard and soft value thresholds for numeric args
+ if (value instanceof Number) {
+ if (argumentSource.field.isAnnotationPresent(Argument.class)) {
+ argBindings.put("minValue", argumentSource.field.getAnnotation(Argument.class).minValue());
+ argBindings.put("maxValue", argumentSource.field.getAnnotation(Argument.class).maxValue());
+ if (argumentSource.field.getAnnotation(Argument.class).minRecommendedValue() != Double.NEGATIVE_INFINITY) {
+ argBindings.put("minRecValue", argumentSource.field.getAnnotation(Argument.class).minRecommendedValue());
+ }
+ if (argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue() != Double.POSITIVE_INFINITY) {
+ argBindings.put("maxRecValue", argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue());
+ }
+ }
+ }
+ // Finalize argument bindings
args.get(kind).add(argBindings);
args.get("all").add(argBindings);
}
@@ -742,8 +756,14 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
/**
* Returns a Pair of (main, synonym) names for argument with fullName s1 and
- * shortName s2. The main is selected to be the longest of the two, provided
- * it doesn't exceed MAX_DISPLAY_NAME, in which case the shorter is taken.
+ * shortName s2.
+ *
+ * Previously we had it so the main name was selected to be the longest of the two, provided
+ * it didn't exceed MAX_DISPLAY_NAME, in which case the shorter was taken. But we now disable
+ * the length-based name rearrangement in order to maintain consistency in the GATKDocs table.
+ *
+ * This may cause messed up spacing in the CLI-help display but we don't care as much about that
+ * since more users use the online GATKDocs for looking up arguments.
*
* @param s1 the short argument name without -, or null if not provided
* @param s2 the long argument name without --, or null if not provided
@@ -758,13 +778,7 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
if (s1 == null) return new Pair(s2, null);
if (s2 == null) return new Pair(s1, null);
- String l = s1.length() > s2.length() ? s1 : s2;
- String s = s1.length() > s2.length() ? s2 : s1;
-
- if (l.length() > MAX_DISPLAY_NAME)
- return new Pair(s, l);
- else
- return new Pair(l, s);
+ return new Pair(s2, s1);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java
index 2ed35d848..783e7aa90 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java
@@ -50,15 +50,34 @@ public class HelpConstants {
public final static String DOCS_CAT_RF = "Read Filters";
public final static String DOCS_CAT_REFUTILS = "Reference Utilities";
public final static String DOCS_CAT_RODCODECS = "ROD Codecs";
- public final static String DOCS_CAT_USRERR = "User Exceptions";
+ public final static String DOCS_CAT_USRERR = "User Exceptions (DevZone)";
public final static String DOCS_CAT_VALIDATION = "Validation Utilities";
public final static String DOCS_CAT_ANNOT = "Variant Annotations";
public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools";
public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools";
- public final static String DOCS_CAT_TEST = "Testing Tools";
+ public final static String DOCS_CAT_TOY = "Toy Walkers (DevZone)";
public final static String DOCS_CAT_HELPUTILS = "Help Utilities";
public static String forumPost(String post) {
return GATK_FORUM_URL + post;
}
+
+ /**
+ * Go-to developer name codes for tracking and display purposes. Only current team members should be in this list.
+ * When someone leaves, their charges should be redistributed. The actual string should be as close to the dev's
+ * abbreviated name or two/three-letter nickname as possible. The code can be something else if necessary to
+ * disambiguate it from other variables.
+ */
+ public final static String MC = "MC"; // Mauricio Carneiro
+ public final static String EB = "EB"; // Eric Banks
+ public final static String RP = "RP"; // Ryan Poplin
+ public final static String GVDA = "GG"; // Geraldine Van der Auwera
+ public final static String VRR = "VRR"; // Valentin Ruano-Rubio
+ public final static String ALM = "ALM"; // Ami Levy-Moonshine
+ public final static String BH = "BH"; // Bertrand Haas
+ public final static String JoT = "JT"; // Joel Thibault
+ public final static String DR = "DR"; // David Roazen
+ public final static String KS = "KS"; // Khalid Shakir
+
+
}
\ No newline at end of file
diff --git a/settings/helpTemplates/common.html b/settings/helpTemplates/common.html
index f4fb74af1..ff9df5eea 100644
--- a/settings/helpTemplates/common.html
+++ b/settings/helpTemplates/common.html
@@ -86,7 +86,13 @@
Support Forum
-
GATK version ${version} built at ${timestamp}.
+
GATK version ${version} built at ${timestamp}.
+ <#-- closing P tag in next macro -->
+ #macro>
+
+ <#macro footerClose>
+ <#-- ugly little hack to enable adding tool-specific info inline -->
+
- ${arg.summary}. ${arg.fulltext}
- <#if arg.rodTypes??>${arg.name} binds reference ordered data. This argument supports ROD files of the
- following types: ${arg.rodTypes}
- #if>
- <#if arg.options??>
-
- The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values:
-
This table summarizes the command-line arguments that are specific to this tool. For details, see the list further down below the table.
+
This table summarizes the command-line arguments that are specific to this tool. For more details on each argument, see the list further down below the table or click on an argument name to jump directly to that entry in the list.
-
Name
-
Type
+
Argument name(s)
+
Default value
Summary
@@ -267,6 +293,11 @@
<@argumentDetails arg=arg/>
#list>
#if>
-
+
<@footerInfo />
+ <#-- Specify go-to developer (for internal use) -->
+ <#if gotoDev??>
+ GTD: ${gotoDev}
+ #if>
+ <@footerClose />
<@pageFooter />
\ No newline at end of file
From bdb3954eb37da87266a6d2fc9a531efeadd0e659 Mon Sep 17 00:00:00 2001
From: Geraldine Van der Auwera
Date: Mon, 13 Jan 2014 20:45:43 -0500
Subject: [PATCH 016/113] removed maxRuntime minValue
---
.../sting/gatk/arguments/GATKArgumentCollection.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 88b34090c..2bbc5482b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -142,7 +142,7 @@ public class GATKArgumentCollection {
/**
* This will truncate the run but without exiting with a failure. By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument.
*/
- @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false, minValue = 0)
+ @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false)
public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;
@Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false)
From fd511d12a2143d83cdb931bad984a8b28d7a0254 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Mon, 13 Jan 2014 22:47:43 -0500
Subject: [PATCH 017/113] Fixing the Haplotype Resolver so that it doesn't
complain about missing header lines.
The code comments very clearly state that INFO fields shouldn't be propagated into the output,
but someone must have accidentally changed it afterwards. This is just a simple one-line fix
to make sure the code adheres to the comments.
Delivers #63333488.
---
.../sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java
index 01ab421b3..cfd07da67 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java
@@ -258,7 +258,7 @@ public class HaplotypeResolver extends RodWalker {
}
private void writeOne(final VariantContext vc, final String set, final String status) {
- final Map attrs = new HashMap(vc.getAttributes());
+ final Map attrs = new HashMap<>();
if ( SET_KEY != null && set != null )
attrs.put(SET_KEY, set);
if ( STATUS_KEY != null && status != null )
From edf58800222a92dcf30d07a98b8ab6264005c3e2 Mon Sep 17 00:00:00 2001
From: Geraldine Van der Auwera
Date: Fri, 10 Jan 2014 18:21:56 -0500
Subject: [PATCH 018/113] Updated SAMPileup codec and pileup-related docs
Problem: the codec was written to take in consensus pileups produced with pileup -c option (which consists of 10 or 13 fields per line depending on the variant type) but errored out on the basic pileup format (which only has 6 fields per line). This was inconsistent and confusing to users.
Solution: I added a switch in the parsing to recognize and handle both cases more appropriately, and updated related docs. While I was at it I also improved error messages in CheckPileup, which now emits User Error: Bad Input exceptions when reporting mismatches. Which may not be the best thing to do (ultimately they're not really errors, they're just reporting unwelcome results) but it beats emitting Runtime Exceptions.
Tested by CheckPileupIntegrationTest which tests both format cases.
---
.../sting/gatk/walkers/qc/CheckPileup.java | 105 +++++++-
.../sting/gatk/walkers/qc/Pileup.java | 52 +++-
.../codecs/sampileup/SAMPileupCodec.java | 229 ++++++++++++------
.../codecs/sampileup/SAMPileupFeature.java | 4 +
.../qc/CheckPileupIntegrationTest.java | 28 ++-
5 files changed, 318 insertions(+), 100 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
index 533c7be73..b6a3853f8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java
@@ -49,20 +49,109 @@ import java.io.PrintStream;
import java.util.Arrays;
/**
- * At every locus in the input set, compares the pileup data (reference base, aligned base from
- * each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data
- * should be specified using the command-line argument '-pileup:SAMPileup '.
+ * Compare GATK's internal pileup to a reference Samtools pileup
+ *
+ *
At every locus in the input set, compares the pileup data (reference base, aligned base from
+ * each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated
+ * by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly
+ * different output format by default.
+ *
+ *
+ *
Format
+ *
There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
+ * 10-column "consensus" format which could be obtained by using the -c argument, now deprecated.
+ *
Simple pileup: 6-column format
+ *
+ * Each line consists of chromosome, 1-based coordinate, reference base, the
+ * number of reads covering the site, read bases and base qualities. At the
+ * read base column, a dot stands for a match to the reference base on the
+ * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch
+ * on the forward strand and `acgtn' for a mismatch on the reverse strand.
+ * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between
+ * this reference position and the next reference position. The length of the
+ * insertion is given by the integer in the pattern, followed by the inserted sequence.
+ *
+ *
+ * seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ * seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ * seq1 274 T 23 ,.$....,,.,.,...,,,.,... 7<7;<;<<<<<<<<<=<;<;<<6
+ * seq1 275 A 23 ,$....,,.,.,...,,,.,...^l. <+;9*<<<<<<<<<=<<:;<<<<
+ * seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6<
+ * seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&<
+ * seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
+ * seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ *
The "consensus" or extended pileup consists of the following:
+ *
+ *
original 6 columns as described above
+ *
4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
+ * reads covering the sites) for all sites, inserted before the bases and quality strings
+ *
3 extra columns indicating counts of reads supporting indels (just for indel sites)
+ *
+ *
+ *
Example of consensus pileup for SNP or non-variant sites
+ *
+ * seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
+ * seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
+ * seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
+ * seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
+ * seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
+ * seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
+ * seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
+ * seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
+ * seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
+ *
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@Requires(value={DataSource.READS,DataSource.REFERENCE})
public class CheckPileup extends LocusWalker implements TreeReducible {
- @Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true)
+ /**
+ * This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval.
+ */
+ @Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true)
RodBinding pileup;
@Output
private PrintStream out;
-
- @Argument(fullName="continue_after_error",doc="Continue after an error",required=false)
+ /**
+ * By default the program will quit if it encounters an error (such as missing truth data for a given position).
+ * Use this flag to override the default behavior; the program will then simply print an error message and move on
+ * to the next position.
+ */
+ @Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false)
public boolean CONTINUE_AFTER_AN_ERROR = false;
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@@ -72,7 +161,7 @@ public class CheckPileup extends LocusWalker implemen
if ( truePileup == null ) {
out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar()));
if ( ! CONTINUE_AFTER_AN_ERROR ) {
- throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
+ throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
context.getLocation(), new String(pileup.getBases())));
}
} else {
@@ -80,7 +169,7 @@ public class CheckPileup extends LocusWalker implemen
if ( pileupDiff != null ) {
out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString());
if ( ! CONTINUE_AFTER_AN_ERROR ) {
- throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff));
+ throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff));
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
index 23bbf1460..48e21fdd0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java
@@ -48,11 +48,17 @@ import java.util.List;
/**
* Emulates the samtools pileup command to print aligned reads
*
- *
Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position,
- * consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
+ *
Prints the alignment in something similar to the Samtools pileup format (see the
+ * Pileup format documentation for more details about
+ * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference
+ * base, read bases, and read qualities. In addition to these default fields, additional information can be added to
+ * the output as extra columns; see options detailed below.
+ * chr1 257 A CAA '&=
+ * chr1 258 C TCC A:=
+ * chr1 259 C CCC )A=
+ * chr1 260 C ACC (=<
+ * chr1 261 T TCT '44
+ * chr1 262 A AAA '?:
+ * chr1 263 A AGA 1'6
+ * chr1 264 C TCC 987
+ * chr1 265 C CCC (@(
+ * chr1 266 C GCC ''=
+ * chr1 267 T AAT 7%>
+ *
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@@ -83,18 +104,25 @@ public class Pileup extends LocusWalker implements TreeReducibl
PrintStream out;
/**
- * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
+ * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
* and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per
* read items are delimited with an '@' character.
*/
@Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false)
public boolean SHOW_VERBOSE = false;
-
- @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
+ /**
+ * This enables annotating the pileup to show overlaps with metadata from a ROD file.
+ * For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup
+ * output at that position will be annotated with the corresponding source ROD identifier.
+ */
+ @Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false)
public List> rods = Collections.emptyList();
-
+ /**
+ * Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert
+ * produced during library generation before sequencing.
+ */
@Hidden
- @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false)
+ @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false)
public boolean outputInsertLength=false;
@Override
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
index 34705c4c9..70241a6c4 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
@@ -37,13 +37,21 @@ import java.util.regex.Pattern;
import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;
/**
- * Decoder for SAM pileup data. For GATK validation purposes only
+ * Decoder for SAM pileup data.
*
*
- * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute.
- * It desribes the base-pair information at each chromosomal position. This format
- * facilitates SNP/indel calling and brief alignment viewing by eyes.
+ * From the SAMTools project documentation:
*
+ *
The Pileup format was first used by Tony Cox and Zemin Ning at
+ * the Sanger Institute. It describes the base-pair information at each chromosomal position. This format
+ * facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced
+ * in Samtools by mpileup, which produces a slightly different output format by default.
+ *
+
+ *
Format
+ *
There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
+ * 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated.
+ *
Simple pileup: 6-column format
*
* Each line consists of chromosome, 1-based coordinate, reference base, the
* number of reads covering the site, read bases and base qualities. At the
@@ -54,13 +62,6 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
* this reference position and the next reference position. The length of the
* insertion is given by the integer in the pattern, followed by the inserted sequence.
*
The "consensus" or extended pileup consists of the following:
+ *
+ *
original 6 columns as described above
+ *
4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
+ * reads covering the sites) for all sites, inserted before the bases and quality strings
+ *
3 extra columns indicating counts of reads supporting indels (just for indel sites)
+ *
+ *
+ *
Example of consensus pileup for SNP or non-variant sites
+ *
+ * seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
+ * seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
+ * seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
+ * seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
+ * seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
+ * seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
+ * seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
+ * seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
+ * seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
+ *
Handling of indels is questionable at the moment. Proceed with care.
+ *
+ *
+ * @author Matt Hanna, Geraldine VdAuwera
+ * @since 2014
*/
public class SAMPileupCodec extends AsciiFeatureCodec {
- // the number of tokens we expect to parse from a pileup line
- private static final int expectedTokenCount = 10;
+ // number of tokens expected (6 or 10 are valid, anything else is wrong)
+ private static final int basicTokenCount = 6;
+ private static final int consensusSNPTokenCount = 10;
+ private static final int consensusIndelTokenCount = 13;
private static final char fldDelim = '\t';
-
// allocate once and don't ever bother creating them again:
private static final String baseA = "A";
private static final String baseC = "C";
@@ -92,74 +133,110 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
}
public SAMPileupFeature decode(String line) {
-// 0 1 2 3 4 5 6 7
-//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
-//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
- String[] tokens = new String[expectedTokenCount];
+ //+1 because we want to know if we have more than the max
+ String[] tokens = new String[consensusIndelTokenCount+1];
// split the line
- int count = ParsingUtils.split(line,tokens,fldDelim);
-
- // check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
- if (count != expectedTokenCount)
- throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " +
- "(expected = " + expectedTokenCount + ", saw = " + count + " on " +
- "line = " + line + ")");
+ final int count = ParsingUtils.split(line,tokens,fldDelim);
SAMPileupFeature feature = new SAMPileupFeature();
+ /**
+ * Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation.
+ * The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at
+ * the end for indels. We are currently not making any use of the extra indel tokens.
+ *
+ * Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong.
+ */
+ final String observedString, bases, quals;
+
feature.setChr(tokens[0]);
feature.setStart(Integer.parseInt(tokens[1]));
- if(tokens[2].length() != 1)
+ if(tokens[2].length() != 1) {
throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
- feature.setRef(Character.toUpperCase(tokens[2].charAt(0)));
-
- String observedString = tokens[3].toUpperCase(); // field 3
- feature.setFWDAlleles(new ArrayList(2));
-
- feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
- feature.setVariantConfidence(Double.parseDouble(tokens[5]));
-
- if ( feature.getRef() == '*' ) {
- parseIndels(observedString,feature) ;
- if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
- else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
- } else {
- parseBasesAndQuals(feature,tokens[8],tokens[9]);
- // if the variant is a SNP or a reference base (i.e. no variant at all)
- if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
- feature.setRefBases(tokens[2].toUpperCase());
- feature.setEnd(feature.getStart());
-
- char ch = observedString.charAt(0);
-
- switch ( ch ) {
- case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
- case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
- case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
- case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
- case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
- case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
- case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
- case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
- case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
- case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
- }
- if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
- else {
- // we know that at least one allele is non-ref;
- // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
- // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
- feature.setVariantType(VariantType.SNP);
- if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
- feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
- feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
- ) feature.setNumNonRef(1);
- else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
- }
}
+ feature.setRef(tokens[2].charAt(0));
+ switch (count) {
+ case basicTokenCount:
+ bases = tokens[4];
+ quals = tokens[5];
+ // parsing is pretty straightforward for 6-col format
+ if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format
+ throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format");
+ } else {
+ parseBasesAndQuals(feature, bases, quals);
+ feature.setRefBases(tokens[2].toUpperCase());
+ feature.setEnd(feature.getStart());
+ }
+ break;
+ case consensusSNPTokenCount: // pileup called a SNP or a reference base
+ observedString = tokens[3].toUpperCase();
+ feature.setFWDAlleles(new ArrayList(2));
+ feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+ feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+ bases = tokens[8];
+ quals = tokens[9];
+ // confirm that we have a non-variant, not a mis-parsed indel
+ if ( feature.getRef() == '*' ) {
+ throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel");
+ }
+ // Parse the SNP or non-variant
+ parseBasesAndQuals(feature, bases, quals);
+ if ( observedString.length() != 1 ) {
+ throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString);
+ }
+ feature.setRefBases(tokens[2].toUpperCase());
+ feature.setEnd(feature.getStart());
+
+ char ch = observedString.charAt(0);
+
+ switch ( ch ) { // record alleles (decompose ambiguous base codes)
+ case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
+ case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
+ case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
+ case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
+ case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
+ case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
+ case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
+ case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
+ case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
+ case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
+ }
+ if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
+ else {
+ // we know that at least one allele is non-ref;
+ // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
+ // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
+ feature.setVariantType(VariantType.SNP);
+ if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
+ feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
+ feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
+ ) feature.setNumNonRef(1);
+ else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
+ }
+ break;
+ case consensusIndelTokenCount:
+ observedString = tokens[3].toUpperCase();
+ feature.setFWDAlleles(new ArrayList(2));
+ feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+ feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+ // confirm that we have an indel, not a mis-parsed SNP or non-variant
+ if ( feature.getRef() != '*' ) {
+ throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'");
+ }
+ // Parse the indel
+ parseIndels(observedString,feature) ;
+ if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
+ else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!!
+ break;
+ default:
+ throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " +
+ "(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount +
+ " (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount +
+ " (consensus pileup for an indel); saw = " + count + " on line = " + line + ")");
+ }
return feature;
}
@@ -197,7 +274,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
else feature.setVariantType(VariantType.DELETION);
feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele"
break;
- default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
+ default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype);
}
feature.getFWDAlleles().add(varBases);
feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!!
@@ -224,7 +301,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
{
//System.out.printf("%s%n%s%n", bases, quals);
- // needs to convert the base string with it's . and , to the ref base
+ // needs to convert the base string with its . and , to the ref base
StringBuilder baseBuilder = new StringBuilder();
StringBuilder qualBuilder = new StringBuilder();
boolean done = false;
@@ -254,7 +331,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec {
Matcher match = regex.matcher(rest);
if ( ! match.matches() ) {
if ( feature.getRef() != '*' )
- throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
+ throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i);
done = true;
}
else {
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
index a6fd996fd..287363601 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java
@@ -33,6 +33,10 @@ import java.util.List;
/**
* A tribble feature representing a SAM pileup.
*
+ * Allows intake of both simple (6-column) or extended/consensus (10/13-column) pileups. Simple pileup features will
+ * contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as
+ * input for analysis that requires that information.
+ *
* @author mhanna
* @version 0.1
*/
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
index 4d3741228..0971cb90b 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java
@@ -33,18 +33,38 @@ import java.util.Collections;
/**
* Run validating pileup across a set of core data as proof of the integrity of the GATK core.
*
- * @author mhanna
- * @version 0.1
+ * Tests both types of old-school pileup formats (basic and consensus).
+ *
+ * @author mhanna, vdauwera
+ * @version 0.2
*/
public class CheckPileupIntegrationTest extends WalkerTest {
+ /**
+ * This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels
+ */
@Test(enabled = true)
- public void testEcoliThreaded() {
+ public void testEcoliConsensusPileup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CheckPileup" +
" -I " + validationDataLocation + "MV1994.selected.bam" +
" -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
" --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" +
" -S SILENT -nt 8",0, Collections.emptyList());
- executeTest("testEcoliThreaded",spec);
+ executeTest("testEcoliConsensusPileup",spec);
+ }
+
+ /**
+ * This test runs on a basic pileup containing 6-column lines for all variants TODO
+ */
+ @Test
+ public void testEcoliBasicPileup() {
+ WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+ "-T CheckPileup" +
+ " -I " + validationDataLocation + "MV1994.selected.bam" +
+ " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
+ " --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" +
+ " -L Escherichia_coli_K12:1-49" +
+ " -S SILENT -nt 8",0, Collections.emptyList());
+ executeTest("testEcoliBasicPileup",spec);
}
}
From 9f1ab0087a1523f8a9b06602c092351d74fa110f Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Tue, 14 Jan 2014 12:56:54 -0500
Subject: [PATCH 019/113] Added in a check for what would be an empty allele
after trimming.
---
.../indels/PairHMMIndelErrorModel.java | 28 ++++++++++++++-----
.../PairHMMIndelErrorModelUnitTest.java | 17 ++++++++++-
2 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
index 0b0fa020e..318779cd2 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@@ -205,10 +205,21 @@ public class PairHMMIndelErrorModel {
}
}
- private LinkedHashMap trimHaplotypes(final LinkedHashMap haplotypeMap,
- long startLocationInRefForHaplotypes,
- long stopLocationInRefForHaplotypes,
- final ReferenceContext ref){
+ /**
+ * Trims the haplotypes in the given map to the provided start/stop.
+ *
+ * @param haplotypeMap the input map
+ * @param startLocationInRefForHaplotypes the start location of the trim
+ * @param stopLocationInRefForHaplotypes the stop location of the trim
+ * @param ref the reference context (used for debugging only, so can be null)
+ * @return a non-null mapping corresponding to the trimmed version of the original;
+ * some elements may be lost if trimming cannot be performed on them (e.g. they fall outside of the region to keep)
+ */
+ protected static Map trimHaplotypes(final Map haplotypeMap,
+ long startLocationInRefForHaplotypes,
+ long stopLocationInRefForHaplotypes,
+ final ReferenceContext ref) {
+ if ( haplotypeMap == null ) throw new IllegalArgumentException("The input allele to haplotype map cannot be null");
final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>();
for (final Allele a: haplotypeMap.keySet()) {
@@ -225,10 +236,13 @@ public class PairHMMIndelErrorModel {
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
+ if ( indStart >= indStop )
+ continue;
- if (DEBUG)
- System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n",
- indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
+ // commented out here because we need to make this method static for unit testing
+ //if (DEBUG)
+ // System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n",
+ // indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
// get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map
final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java
index bbbef43d3..3480b6775 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java
@@ -50,9 +50,12 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.variantcontext.Allele;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
@@ -138,4 +141,16 @@ public class PairHMMIndelErrorModelUnitTest extends BaseTest {
Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 13), true);
Assert.assertEquals(PairHMMIndelErrorModel.mustClipDownstream(read, 14), false);
}
-}
\ No newline at end of file
+
+ @Test
+ public void trimHaplotypesToNullAlleleTest() {
+ // we need a case where start and stop > haplotype coordinates
+ final int start = 100, stop = 100;
+ final Haplotype h = new Haplotype(new byte[]{(byte)'A'}, new UnvalidatingGenomeLoc("1", 0, 10, 10));
+ final Map input = new HashMap(1);
+ input.put(Allele.create("A"), h);
+
+ final Map output = PairHMMIndelErrorModel.trimHaplotypes(input, start, stop, null);
+ Assert.assertTrue(output.isEmpty());
+ }
+}
From de561345793a2fff10c6b8204ce3b7aa6b266f5e Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 9 Jan 2014 12:29:54 -0500
Subject: [PATCH 020/113] Fixed up and refactored what seems to be a useful
private tool to create simulated reads around a VCF.
It didn't completely work before (it was hard-coded for a particular long-lost data set) but it should work now.
Since I thought that it might prove useful to others, I moved it to protected and added integration tests.
GERALDINE: NEW TOOL ALERT!
---
.../SimulateReadsForVariants.java | 395 ++++++++++++++++++
...mulateReadsForVariantsIntegrationTest.java | 95 +++++
public/testdata/forSimulation.vcf | 8 +
public/testdata/forSimulation.vcf.idx | Bin 0 -> 2067 bytes
4 files changed, 498 insertions(+)
create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java
create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java
create mode 100644 public/testdata/forSimulation.vcf
create mode 100644 public/testdata/forSimulation.vcf.idx
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java
new file mode 100644
index 000000000..7054d78cd
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java
@@ -0,0 +1,395 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.simulatereads;
+
+import cern.jet.random.Poisson;
+import cern.jet.random.engine.MersenneTwister;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMProgramRecord;
+import net.sf.samtools.SAMReadGroupRecord;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.variant.vcf.*;
+import org.broadinstitute.variant.variantcontext.*;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.text.TextFormattingUtils;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.help.HelpConstants;
+
+import java.util.*;
+
+/**
+ * Generates simulated reads for variants
+ *
+ *
Given a set of variants, this tool will generate simulated reads that support the input variants.
+ *
+ *
Caveats
+ *
For practical reasons, only bi-allelic variants that are not too close to the ends of contigs (< 1/2 read length) are supported; all others will simply be ignored.
+ *
+ *
Input
+ *
A VCF file containing variants.
+ *
+ *
Output
+ *
A BAM file containing simulated sequence reads that support the input variants, with the requested error rate and coverage depth.
+ *
+ */
+@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.EB)
+
+@Reference(window=@Window(start=-200,stop=200))
+public class SimulateReadsForVariants extends RodWalker {
+
+ @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
+ /**
+ * The simulated reads will be written to a BAM file.
+ */
+ @Output(doc="Reads corresponding to variants", required=true)
+ protected StingSAMFileWriter readWriter;
+ /**
+ * Use this argument to set the desired target read depth. See the readSamplingMode argument for options that
+ * determine whether coverage distribution will be exactly this value or an approximation.
+ */
+ @Argument(fullName="readDepth", shortName="DP", doc="Read depth to generate", required=false, minValue = 0, minRecommendedValue = 1, maxRecommendedValue = 1000, maxValue = Integer.MAX_VALUE)
+ public int readDepth = 20;
+ /**
+ * Errors will be generated at this rate in the simulated reads. Base qualities are therefore also assigned this value.
+ */
+ @Argument(fullName="errorRate", shortName="ER", doc="Base error rate (Phred-scaled)", required=false, minValue = 0, maxValue = Integer.MAX_VALUE)
+ public int phredErrorRate = 20;
+ /**
+ * All simulated reads will be exactly this length.
+ */
+ @Argument(fullName="readLength", shortName="RL", doc="Read lengths (bp)", required=false, minValue = 1, maxValue = Integer.MAX_VALUE)
+ public int readLength = 101;
+ /**
+ * The corresponding platform identifier will be specified in the simulated read group PL tag. This setting does not
+ * affect the properties of the simulated reads.
+ */
+ @Advanced
+ @Argument(fullName="rgPlatform", shortName="RGPL", doc="Sequencing platform", required=false)
+ public NGSPlatform rgPlatform = NGSPlatform.ILLUMINA;
+ /**
+ * This determines how read sampling is achieved, and affects the coverage distribution of simulated reads.
+ * CONSTANT sampling will produce uniform depth at all positions, while POISSON sampling will produce a
+ * distribution of coverages around the requested value.
+ */
+ @Advanced
+ @Argument(fullName="readSamplingMode", shortName="RSM", doc="Sampling mode", required=false)
+ public ReadSamplingMode samplingMode = ReadSamplingMode.CONSTANT;
+ public enum ReadSamplingMode { CONSTANT, POISSON };
+
+ @Hidden
+ @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags, for integration tests", required=false)
+ public boolean NO_PG_TAG = false;
+
+ @Hidden
+ @Argument(fullName="verbose", shortName="verbose", doc="Verbose", required=false)
+ public boolean verbose = false;
+
+ public static final String PROGRAM_RECORD_NAME = "GATK SimulateReadsForVariants";
+
+ // variables used to store state
+ private long readNameCounter = 1;
+ private int halfReadLength;
+ private double errorRate;
+ private byte[] readQuals;
+ private SAMFileHeader header = null;
+
+ // randomness related variables
+ private static final long RANDOM_SEED = 1252863495;
+ private static final Random ran = GenomeAnalysisEngine.getRandomGenerator();
+ private Poisson poissonRandom = null;
+
+ // samples and read groups
+ private final Map sample2RG = new HashMap();
+
+ private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); }
+
+ private SAMReadGroupRecord createRG(String name) {
+ SAMReadGroupRecord rg = new SAMReadGroupRecord(name);
+ rg.setPlatform(rgPlatform.getDefaultPlatform());
+ rg.setSample(name);
+ return rg;
+ }
+
+ // class to store the bases, offset, and representative CIGAR of a haplotype
+ private static class ArtificialHaplotype {
+ public final byte[] bases;
+ public final int offset;
+ public final String cigar;
+
+ public ArtificialHaplotype(final byte[] bases, final int offset, final String cigar) {
+ this.bases = bases;
+ this.offset = offset;
+ this.cigar = cigar;
+ }
+ }
+
+ @Override
+ public void initialize() {
+
+ // initialize sample -> read group map
+ final List sampleRGs = new ArrayList();
+ for ( final String sample : SampleUtils.getUniqueSamplesFromRods(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) {
+ final SAMReadGroupRecord rg = createRG(sample);
+ sampleRGs.add(rg);
+ sample2RG.put(sample, rg);
+ }
+
+ // initialize BAM headers
+ header = new SAMFileHeader();
+ header.setSequenceDictionary(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary());
+ header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
+ header.setReadGroups(sampleRGs);
+
+ final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
+ if ( !NO_PG_TAG ) {
+ final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
+ programRecord.setProgramVersion(headerInfo.getString("org.broadinstitute.sting.gatk.version"));
+ programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
+ }
+ header.setProgramRecords(Arrays.asList(programRecord));
+
+ readWriter.setPresorted(false);
+ readWriter.writeHeader(header);
+
+ halfReadLength = readLength / 2;
+ errorRate = QualityUtils.qualToErrorProb((byte)phredErrorRate);
+ readQuals = new byte[readLength];
+ Arrays.fill(readQuals, (byte)phredErrorRate);
+ if ( samplingMode == ReadSamplingMode.POISSON )
+ poissonRandom = new Poisson(readDepth, new MersenneTwister((int)RANDOM_SEED));
+ }
+
+ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ if ( tracker == null ) // RodWalkers can make funky map calls
+ return 0;
+
+ if ( ref.getLocus().getStart() < readLength || ! BaseUtils.isRegularBase(ref.getBase()) )
+ return 0;
+
+ final VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
+ if ( vc == null || !vc.isBiallelic() )
+ return 0;
+
+ if ( verbose ) logger.info(String.format("Generating reads for %s", vc));
+
+ generateReadsForVariant(vc, ref);
+
+ return 1;
+ }
+
+ /**
+     * Constructs an artificial haplotype given an allele and original reference context
+ *
+ * @param allele the allele to model (can be reference)
+ * @param refLength the length of the reference allele
+ * @param ref the original reference context
+ * @return a non-null ArtificialHaplotype
+ */
+ private ArtificialHaplotype constructHaplotype(final Allele allele, final int refLength, final ReferenceContext ref) {
+
+ final byte[] haplotype = new byte[readLength];
+
+ final int alleleLength = allele.getBases().length;
+ final int halfAlleleLength = (alleleLength + 1) / 2;
+
+ // this is how far back to move from the event to start copying bases
+ final int offset = halfReadLength - halfAlleleLength;
+
+ // copy bases before the event
+ final int locusPosOnRefContext = (int)(ref.getLocus().getStart() - ref.getWindow().getStart());
+ int posOnRefContext = locusPosOnRefContext - offset;
+ System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset);
+ int copiedCount = offset;
+
+ // copy the event bases
+ System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength);
+ copiedCount += alleleLength;
+
+ // copy bases after the event
+ posOnRefContext = locusPosOnRefContext + refLength;
+ final int remainder = readLength - copiedCount;
+ System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder);
+
+ final String cigar;
+ if ( refLength == alleleLength )
+ cigar = readLength + "M";
+ else
+ cigar = (offset + 1) + "M" + Math.abs(refLength - alleleLength) + (refLength > alleleLength ? "D" : "I") + remainder + "M";
+
+ return new ArtificialHaplotype(haplotype, offset, cigar);
+ }
+
+ /**
+ * Generates the artificial reads for a given variant
+ *
+ * @param vc the (bi-allelic) variant context for which to generate artificial reads
+ * @param ref the original reference context
+ */
+ private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref) {
+
+ final int refLength = vc.getReference().getBases().length;
+ final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref);
+ final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref);
+
+ int gi = 0;
+ for ( final Genotype g : vc.getGenotypes() ) {
+ final int myDepth = sampleDepth();
+ for ( int d = 0; d < myDepth; d++ ) {
+
+ final ArtificialHaplotype haplotype = chooseRefHaplotype(g) ? refHap : altHap;
+ final byte[] readBases = Arrays.copyOf(haplotype.bases, readLength);
+
+ addMachineErrors(readBases, errorRate);
+ writeRead(readBases, vc.getChr(), vc.getStart() - haplotype.offset, haplotype.cigar, g.getSampleName(), gi++ % 2 == 0);
+ }
+ }
+ }
+
+ /**
+ * Decides whether or not to choose the reference haplotype, depending on the given genotype
+ *
+ * @param g the genotype of the given sample
+ * @return true if one should use the reference haplotype, false otherwise
+ */
+ private boolean chooseRefHaplotype(final Genotype g) {
+ final double refP;
+ if ( g.isHomRef() ) refP = 1;
+ else if ( g.isHet() ) refP = 0.5;
+ else refP = 0.0;
+
+ return ran.nextDouble() < refP;
+ }
+
+ /**
+ * Generates the artificial read depth
+ *
+ * @return a non-negative int
+ */
+ private int sampleDepth() {
+ switch ( samplingMode ) {
+ case CONSTANT: return readDepth;
+ case POISSON: return poissonRandom.nextInt();
+ default:
+ throw new IllegalStateException("Unexpected DepthSamplingType " + samplingMode);
+ }
+ }
+
+ /**
+ * Creates and writes an artificial read given the appropriate data
+ *
+ * @param readBases the bases
+ * @param contig the contig
+ * @param start the read start
+ * @param cigar the cigar string
+ * @param sample the sample name (used to get the right read group)
+ * @param isNegStrand should this read be on the negative strand?
+ */
+ private void writeRead(final byte[] readBases, final String contig, final int start,
+ final String cigar, final String sample, final boolean isNegStrand) {
+ final GATKSAMRecord read = new GATKSAMRecord(header);
+ read.setBaseQualities(readQuals);
+ read.setReadBases(readBases);
+ read.setReadName("" + readNameCounter++);
+ read.setCigarString(cigar);
+ read.setReadPairedFlag(false);
+ read.setAlignmentStart(start);
+ read.setMappingQuality(60);
+ read.setReferenceName(contig);
+ read.setReadNegativeStrandFlag(isNegStrand);
+ read.setAttribute("RG", sampleRG(sample).getReadGroupId());
+
+ readWriter.addAlignment(read);
+ }
+
+ /**
+ * Adds machine errors at the appropriate rate to the provided read bases
+ *
+ * @param readBases the read bases
+ * @param errorRate the rate at which to produce errors
+ */
+ private void addMachineErrors(final byte[] readBases, final double errorRate) {
+ for ( int i = 0; i < readBases.length; i++ ) {
+ final double r = ran.nextDouble();
+ if ( r < errorRate ) {
+ byte errorBase = BaseUtils.baseIndexToSimpleBase(BaseUtils.getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(readBases[i])));
+ if ( errorBase == readBases[i] ) throw new IllegalStateException("Read and error bases are the same");
+ readBases[i] = errorBase;
+ }
+ }
+ }
+
+ @Override
+ public Integer reduceInit() {
+ return 0;
+ }
+
+ @Override
+ public Integer reduce(Integer counter, Integer sum) {
+ return counter + sum;
+ }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java
new file mode 100644
index 000000000..2ae904e65
--- /dev/null
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java
@@ -0,0 +1,95 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.simulatereads;
+
+import org.broadinstitute.sting.WalkerTest;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+
+public class SimulateReadsForVariantsIntegrationTest extends WalkerTest {
+
+ @Test
+ public void testDefaults() {
+
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s",
+ 1,
+ Arrays.asList("dd9e17a9c268578e903ecd4ca0a4a335"));
+ executeTest("testVariants", spec);
+ }
+
+ @Test
+ public void testReadLength() {
+
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-RL 70 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s",
+ 1,
+ Arrays.asList("d7388376ffd4d3826d48a5be0be70632"));
+ executeTest("testReadLength", spec);
+ }
+
+ @Test
+ public void testErrorRate() {
+
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-ER 40 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s",
+ 1,
+ Arrays.asList("6c9bf583f4b2708d6b82f54516474b7b"));
+ executeTest("testErrorRate", spec);
+ }
+
+ @Test
+ public void testPlatformTag() {
+
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-RGPL SOLID -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forSimulation.vcf -o %s",
+ 1,
+ Arrays.asList("26db391f223ead74d786006a502029d8"));
+ executeTest("testPlatformTag", spec);
+ }
+}
diff --git a/public/testdata/forSimulation.vcf b/public/testdata/forSimulation.vcf
new file mode 100644
index 000000000..a0c57c2c0
--- /dev/null
+++ b/public/testdata/forSimulation.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.1
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892
+20 10000000 . T C . . . GT 0/1 0/0 1/1
+20 10001000 . GG AA . . . GT 0/1 0/0 1/1
+20 10002000 . TAGTA T . . . GT 0/1 0/0 1/1
+20 10003000 . A AGCT . . . GT 0/1 0/0 1/1
+20 10004000 . GAT G,GATAT . . . GT 0/1 0/0 1/1
diff --git a/public/testdata/forSimulation.vcf.idx b/public/testdata/forSimulation.vcf.idx
new file mode 100644
index 0000000000000000000000000000000000000000..4f734b7afa596624ad9cab6a74698f1ddc801098
GIT binary patch
literal 2067
zcmb_d%T8256l`PMxbaJ5`aJq^+z}X+1Tm4(;A(;fk%(g=;KC1a?PvITRzEK3cI_ks
zm*U*&I&~hsIXXOh;GBDi@9Oj2*UO9D>hk9N>CMOQw7U3kzWZ{sI=#KxU2gAoH@D{>
zt}j-fzI{Kv`g(VLetY$8xA)`YCwE;K@cDE6>*qm#c4q*dulF}^->kRC>ysCom)lqC
z{Szm9@n8IObGSWRzjL%_Y7%me*&-$5+sBeG*APUMUHXu04-Y9VF5z$d1g!q_RL~<02B*)lE
zvm=cX6}&CN01%Vl>;{l`Bq4|GLZ#_ie^)GDf^UI7g}@3h_mJX3VRm?v;hzG6&R|S*zUC<
z)j;Ak#4i5~t^`g#EWW)5Cw#vTc(rj9L$pTt;@Er7KJ9^+W0&4QjsW4e@9H{g1vMBy
z5(ok1gkp>+`HIFo@k(KU!>nVhgn}w6GorrmN~Fc1ra_Bb6nd$1M=L`_B?ktw8k;Ef
zVzyW>g(0a;xoOE*GTLxpKr2@?_%REz7Hc%e<`QdZ_Cd9x#*^vNVofI0Gh)?~pZzTi
z>d{}LGHJ0fjLO&-WlG*pv07@OWHR%%Sd%%6;<_@1Z#+$lXDXsIZ`B}yXWn|6M4Muz
zwpqx_1h-m9l9Xub6t3LZB-ZyxZ>u^SnBjT6sB%Z5W$UzPE+$dLo|}B8;s>F-=f~{>w|ti0?XUmybGgO;0=(^;SO5S3
literal 0
HcmV?d00001
From c79e8ca53e40130fd705d4dd6007508d30ac12a5 Mon Sep 17 00:00:00 2001
From: Yossi Farjoun
Date: Wed, 15 Jan 2014 11:02:04 -0500
Subject: [PATCH 021/113] Added an info log containing the SAM/BAM files that
were eventually found from the commandline (useful for when there are files
hiding inside bam.lists which may or may not have been constructed
 correctly...) Added a @hidden option controlling the appearance of the full
BamList in the log
---
.../sting/gatk/CommandLineExecutable.java | 14 +++++++++++---
.../gatk/arguments/GATKArgumentCollection.java | 5 ++++-
2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
index 111786e63..86ecaffe0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@@ -26,8 +26,10 @@
package org.broadinstitute.sting.gatk;
import org.apache.log4j.Logger;
-import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor;
+import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
+import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor;
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor;
@@ -41,7 +43,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.ListFileUtils;
import java.security.PublicKey;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
/**
* @author aaron
@@ -87,7 +91,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
engine.setArguments(getArgumentCollection());
// File lists can require a bit of additional expansion. Set these explicitly by the engine.
- engine.setSAMFileIDs(ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser));
+ final Collection bamFileList=ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser);
+ engine.setSAMFileIDs(bamFileList);
+ if(getArgumentCollection().showFullBamList){
+ logger.info(String.format("Adding the following input SAM Files: %s",bamFileList.toString()));
+ }
engine.setWalker(walker);
walker.setToolkit(engine);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 2bbc5482b..e86780eb4 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
@@ -63,6 +62,10 @@ public class GATKArgumentCollection {
@Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false)
public List samFiles = new ArrayList();
+ @Hidden
+ @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).")
+ public Boolean showFullBamList = false;
+
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0)
public Integer readBufferSize = null;
From 64d5bf650ec320996d24a4064fc87fb51846ecd1 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Wed, 11 Dec 2013 10:20:07 -0500
Subject: [PATCH 022/113] Pulled out some hard-coded values from the
read-threading and isActive code of the HC, and made them into a single
argument.
In unifying the arguments it was clear that the values were inconsistent throughout the code, so now there's a
single value that is intended to be more liberal in what it allows in (in an attempt to increase sensitivity).
Very little code actually changes here, but just about every md5 in the HC integration tests are different (as
expected). Added another integration test for the new argument.
To be used by David R to test his per-branch QC framework: does this commit make the HC look better against the KB?
---
.../haplotypecaller/HaplotypeCaller.java | 25 +++++++------
.../VariantAnnotatorIntegrationTest.java | 2 +-
...lexAndSymbolicVariantsIntegrationTest.java | 6 +--
.../HaplotypeCallerGVCFIntegrationTest.java | 13 +++----
.../HaplotypeCallerIntegrationTest.java | 37 +++++++++++--------
...aplotypeCallerParallelIntegrationTest.java | 2 +-
6 files changed, 45 insertions(+), 40 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 0bedf9062..90471842a 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -341,6 +341,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// general advanced arguments to control haplotype caller behavior
// -----------------------------------------------------------------------------------------------
+ /**
+ * The minimum confidence needed for a given base for it to be used in variant calling.
+ */
+ @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
+ public byte MIN_BASE_QUALTY_SCORE = 10;
+
/**
* Users should be aware that this argument can really affect the results of the variant calling and should exercise caution.
* Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the
@@ -440,10 +446,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
protected boolean debugGraphTransformations = false;
- @Hidden // TODO -- not currently useful
- @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
- protected boolean useLowQualityBasesForAssembly = false;
-
@Hidden
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
protected boolean dontTrimActiveRegions = false;
@@ -536,10 +538,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
- // bases with quality less than or equal to this value are trimmed off the tails of the reads
- private static final byte MIN_TAIL_QUALITY = 20;
-
+ private byte MIN_TAIL_QUALITY;
private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6;
+
// the minimum length of a read we'd consider using for genotyping
private final static int MIN_READ_LENGTH = 10;
@@ -651,9 +652,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
assemblyEngine.setDebugGraphTransformations(debugGraphTransformations);
assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths);
assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails);
+ assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE);
+
+ MIN_TAIL_QUALITY = (byte)(MIN_BASE_QUALTY_SCORE - 1);
if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter);
- if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1);
// setup the likelihood calculation engine
if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1;
@@ -758,12 +761,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage();
for( final Map.Entry sample : splitContexts.entrySet() ) {
- final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods;
+ final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods;
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE);
- final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
+ final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() );
@@ -1113,8 +1116,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
GATKSAMRecord clippedRead;
if (errorCorrectReads)
clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
- else if (useLowQualityBasesForAssembly)
- clippedRead = myRead;
else // default case: clip low qual ends of reads
clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY );
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index 7943eb09b..cf22d941d 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
- final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("0c331915b07b42d726bc3d623aa9997b"));
+ final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("b171258ed3ef5ae0d6c21fe04e5940fc"));
specAnn.disableShadowBCF();
final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
index 5769c3a51..2ea525108 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
@@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
- HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "ff19ae39b0695680ea670d53f6f9ce47");
+ HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b5cd204b9dd6a5210b49d91186cf2b1d");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
- "b787be740423b950f8529ccc838fabdd");
+ "cdf6d200324949a3484668774d2289d7");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
- "f74d68cbc1ecb66a7128258e111cd030");
+ "ccc3864207d700c00238066ec5ae33c5");
}
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
index 3f6151c71..b4bea0359 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java
@@ -65,13 +65,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
- tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "96328c91cf9b06de231b37a22a7a7639"});
- tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ac25e9a78b89655197513bb0eb7a6845"});
- tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "dc0dde72131d562587acae967cf2031f"});
- tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"});
- tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"});
- tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"});
-
+ tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "53aa13711a1ceec1453f21c705723f04"});
+ tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "4bb8e44b2d04757f8b11ca6400828357"});
+ tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "8ddc291584f56e27d125b6a0523f2703"});
+ tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"});
+ tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "e53e164cc3f5cbd5fba083f2cdb98a88"});
+ tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "a258dbbfabe88dad11d57151cd256831"});
return tests.toArray(new Object[][]{});
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
index ba296f263..74149e8cf 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@@ -84,22 +84,27 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
- HCTest(CEUTRIO_BAM, "", "f2ad35b5e0d181fb18da86a8971ce4f4");
+ HCTest(CEUTRIO_BAM, "", "abbfdcbf4bfed7547a48121091a7e16f");
}
@Test
public void testHaplotypeCallerSingleSample() {
- HCTest(NA12878_BAM, "", "06abde3268336a7cdb21970f12e819ba");
+ HCTest(NA12878_BAM, "", "96f299a5cf411900b8eda3845c3ce465");
+ }
+
+ @Test
+ public void testHaplotypeCallerMinBaseQuality() {
+ HCTest(NA12878_BAM, "-mbq 15", "6509cfd0554ecbb92a1b303fbcc0fcca");
}
@Test
public void testHaplotypeCallerGraphBasedSingleSample() {
- HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "3d1cb9acdf66547f88ad1742e8178044");
+ HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "83fe0694621bc1e0240f6f79eb6d6999");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSample() {
- HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "af6f1f504ad771201aedc0157de8830a");
+ HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "8b75034f9aa8435962da98eb521c8f0b");
}
@Test(enabled = false) // can't annotate the rsID's yet
@@ -110,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
- "fd43de437bbaf960499f67daedc6ef63");
+ "fc271f21c0693e4fa6394e27d722fb52");
}
@Test
@@ -126,7 +131,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
- HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3a3bb5f0bcec603287520841c559638f");
+ HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "d3fc49d3d3c8b6439548133e03faa540");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@@ -163,7 +168,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerNearbySmallIntervals() {
- HCTestNearbySmallIntervals(NA12878_BAM, "", "75820a4558a559b3e1636fdd1b776ea2");
+ HCTestNearbySmallIntervals(NA12878_BAM, "", "a415bc76231a04dc38412ff38aa0dc49");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@@ -173,7 +178,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
- final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("170896ddcfe06ec47e08aefefd99cf78"));
+ final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("763d4d8d84a4080db18235a413478660"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@@ -222,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
- Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e"));
+ Arrays.asList("12c56262ed30db1249b8d722e324357c"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@@ -230,7 +235,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
- Arrays.asList("6a9222905c740b9208bf3c67478514eb"));
+ Arrays.asList("1627cf5f3a97e8b73b3c095db46aef1b"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
@@ -244,7 +249,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
- Arrays.asList("6ab05a77d2e79d21ba85fadf844a13ba"));
+ Arrays.asList("51e63c0431817ca1824b01e56341a8ae"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@@ -253,7 +258,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
- Arrays.asList("1352cbe1404aefc94eb8e044539a9882"));
+ Arrays.asList("e39c73bbaf22b4751755d9f5bb2a8d3d"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@@ -261,7 +266,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
- Arrays.asList("903af86b396ce88a6c8e4f4016fbe769"));
+ Arrays.asList("a2ada5984fe835f7f2169f8393d122a6"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@@ -270,7 +275,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
- Arrays.asList("69db1045b5445a4f90843f368bd62814"));
+ Arrays.asList("c14d7f23dedea7e7ec99a90843320c1a"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@@ -293,7 +298,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
- Arrays.asList("824188743703bc09225c5b9c6b404ac1"));
+ Arrays.asList("ee73759f4372df678e7aa97346d87a70"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@@ -301,7 +306,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
- Arrays.asList("14de866430f49c0026aafc1e34ed8250"));
+ Arrays.asList("a9fa660910bf5e35267475f3b2d75766"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java
index 21648b2b9..23513f314 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java
@@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List