alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
if(alignmentIterator.hasNext()) {
int numAlignments = alignmentIterator.next().length;
diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
index 98f2a9b5c..a399867fa 100755
--- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@@ -25,6 +25,9 @@
package org.broadinstitute.sting.analyzecovariates;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.CommandLineProgram;
@@ -33,14 +36,16 @@ import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
+import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.io.Resource;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;
@@ -71,15 +76,13 @@ import java.util.regex.Pattern;
*
*
*
- * NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options
- * must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R,
- * not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using
- * this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball.
- * For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout.
+ * NOTE: Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
+ * See http://www.r-project.org for more info on how to download and install R.
*
*
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
- * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
*
Input
*
@@ -91,7 +94,6 @@ import java.util.regex.Pattern;
* java -Xmx4g -jar AnalyzeCovariates.jar \
* -recalFile /path/to/recal.table.csv \
* -outputDir /path/to/output_dir/ \
- * -resources resources/ \
* -ignoreQ 5
*
*
@@ -101,6 +103,11 @@ import java.util.regex.Pattern;
groupName = "AnalyzeCovariates",
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
+ final private static Logger logger = Logger.getLogger(AnalyzeCovariates.class);
+
+ private static final String PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE = "plot_residualError_QualityScoreCovariate.R";
+ private static final String PLOT_RESDIUAL_ERROR_OTHER_COVARIATE = "plot_residualError_OtherCovariate.R";
+ private static final String PLOT_INDEL_QUALITY_RSCRIPT = "plot_indelQuality.R";
/////////////////////////////
// Command Line Arguments
@@ -114,11 +121,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
@Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
- private String OUTPUT_DIR = "analyzeCovariates/";
- @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.12.0/bin/Rscript", required = false)
- private String PATH_TO_RSCRIPT = "Rscript";
- @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
- private String PATH_TO_RESOURCES = "public/R/";
+ private File OUTPUT_DIR = new File("analyzeCovariates");
@Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
private int IGNORE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
@@ -154,29 +157,26 @@ public class AnalyzeCovariates extends CommandLineProgram {
protected int execute() {
// create the output directory where all the data tables and plots will go
- try {
- Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR);
- } catch (IOException e) {
- System.out.println("Couldn't create directory: " + OUTPUT_DIR);
- System.out.println("User is responsible for making sure the output directory exists.");
- }
- if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; }
- if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
+ if (!OUTPUT_DIR.exists() && !OUTPUT_DIR.mkdirs())
+ throw new UserException.BadArgumentValue("--output_dir/-outputDir", "Unable to create output directory: " + OUTPUT_DIR);
+
+ if (!RScriptExecutor.RSCRIPT_EXISTS)
+ Utils.warnUser(logger, "Rscript not found in environment path. Plots will not be generated.");
// initialize all the data from the csv file and allocate the list of covariates
- System.out.println("Reading in input csv file...");
+ logger.info("Reading in input csv file...");
initializeData();
- System.out.println("...Done!");
+ logger.info("...Done!");
// output data tables for Rscript to read in
- System.out.println("Writing out intermediate tables for R...");
+ logger.info("Writing out intermediate tables for R...");
writeDataTables();
- System.out.println("...Done!");
+ logger.info("...Done!");
// perform the analysis using Rscript and output the plots
- System.out.println("Calling analysis R scripts and writing out figures...");
+ logger.info("Calling analysis R scripts and writing out figures...");
callRScripts();
- System.out.println("...Done!");
+ logger.info("...Done!");
return 0;
}
@@ -287,37 +287,40 @@ public class AnalyzeCovariates extends CommandLineProgram {
if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
String readGroup = readGroupKey.toString();
RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey);
- System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" );
- System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported()));
+ logger.info(String.format(
+ "Writing out data tables for read group: %s\twith %s observations\tand aggregate residual error = %.3f",
+ readGroup, readGroupDatum.getNumObservations(),
+ readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported()));
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
// Create a PrintStream
- PrintStream output = null;
+ File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
+ PrintStream output;
try {
- output = new PrintStream(new FileOutputStream(OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"));
-
- } catch (FileNotFoundException e) {
- System.err.println("Can't create file: " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
- System.exit(-1);
+ output = new PrintStream(FileUtils.openOutputStream(outputFile));
+ } catch (IOException e) {
+ throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
- // Output the header
- output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
+ try {
+ // Output the header
+ output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
- for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) {
- output.print( covariateKey.toString() + "\t" ); // Covariate
- RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey);
- output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
- output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical
- output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
- output.println( thisDatum.getNumObservations() ); // nBases
+ for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) {
+ output.print( covariateKey.toString() + "\t" ); // Covariate
+ RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey);
+ output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
+ output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical
+ output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
+ output.println( thisDatum.getNumObservations() ); // nBases
+ }
+ } finally {
+ // Close the PrintStream
+ IOUtils.closeQuietly(output);
}
-
- // Close the PrintStream
- output.close();
}
} else {
break;
@@ -327,10 +330,6 @@ public class AnalyzeCovariates extends CommandLineProgram {
}
private void callRScripts() {
- RScriptExecutor.RScriptArgumentCollection argumentCollection =
- new RScriptExecutor.RScriptArgumentCollection(PATH_TO_RSCRIPT, Arrays.asList(PATH_TO_RESOURCES));
- RScriptExecutor executor = new RScriptExecutor(argumentCollection, true);
-
int numReadGroups = 0;
// for each read group
@@ -338,23 +337,32 @@ public class AnalyzeCovariates extends CommandLineProgram {
if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) {
String readGroup = readGroupKey.toString();
- System.out.println("Analyzing read group: " + readGroup);
+ logger.info("Analyzing read group: " + readGroup);
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
- final String outputFilename = OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat";
+ final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
if (DO_INDEL_QUALITY) {
- executor.callRScripts("plot_indelQuality.R", outputFilename,
- cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
+ RScriptExecutor executor = new RScriptExecutor();
+ executor.addScript(new Resource(PLOT_INDEL_QUALITY_RSCRIPT, AnalyzeCovariates.class));
+ // The second argument is the name of the covariate in order to make the plots look nice
+ executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
+ executor.exec();
} else {
if( iii == 1 ) {
// Analyze reported quality
- executor.callRScripts("plot_residualError_QualityScoreCovariate.R", outputFilename,
- IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
+ RScriptExecutor executor = new RScriptExecutor();
+ executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE, AnalyzeCovariates.class));
+ // The second argument is the Q scores that should be turned pink in the plot because they were ignored
+ executor.addArgs(outputFile, IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE);
+ executor.exec();
} else { // Analyze all other covariates
- executor.callRScripts("plot_residualError_OtherCovariate.R", outputFilename,
- cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
+ RScriptExecutor executor = new RScriptExecutor();
+ executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_OTHER_COVARIATE, AnalyzeCovariates.class));
+ // The second argument is the name of the covariate in order to make the plots look nice
+ executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
+ executor.exec();
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java
index 351583c07..c0823e5c5 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java
@@ -46,7 +46,7 @@ public class ArgumentMatch implements Iterable {
/**
* Maps indices of command line arguments to values paired with that argument.
*/
- public final SortedMap> indices = new TreeMap>();
+ public final SortedMap> sites = new TreeMap>();
/**
* An ordered, freeform collection of tags.
@@ -72,32 +72,32 @@ public class ArgumentMatch implements Iterable {
}
/**
- * A simple way of indicating that an argument with the given label and definition exists at this index.
+ * A simple way of indicating that an argument with the given label and definition exists at this site.
* @param label Label of the argument match. Must not be null.
* @param definition The associated definition, if one exists. May be null.
- * @param index Position of the argument. Must not be null.
+ * @param site Position of the argument. Must not be null.
* @param tags ordered freeform text tags associated with this argument.
*/
- public ArgumentMatch(final String label, final ArgumentDefinition definition, final int index, final Tags tags) {
- this( label, definition, index, null, tags );
+ public ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final Tags tags) {
+ this( label, definition, site, null, tags );
}
/**
- * A simple way of indicating that an argument with the given label and definition exists at this index.
+ * A simple way of indicating that an argument with the given label and definition exists at this site.
* @param label Label of the argument match. Must not be null.
* @param definition The associated definition, if one exists. May be null.
- * @param index Position of the argument. Must not be null.
+ * @param site Position of the argument. Must not be null.
* @param value Value for the argument at this position.
* @param tags ordered freeform text tags associated with this argument.
*/
- private ArgumentMatch(final String label, final ArgumentDefinition definition, final int index, final String value, final Tags tags) {
+ private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final String value, final Tags tags) {
this.label = label;
this.definition = definition;
ArrayList values = new ArrayList();
if( value != null )
values.add(value);
- indices.put(index,values );
+ sites.put(site,values );
this.tags = tags;
}
@@ -117,7 +117,7 @@ public class ArgumentMatch implements Iterable {
ArgumentMatch otherArgumentMatch = (ArgumentMatch)other;
return this.definition.equals(otherArgumentMatch.definition) &&
this.label.equals(otherArgumentMatch.label) &&
- this.indices.equals(otherArgumentMatch.indices) &&
+ this.sites.equals(otherArgumentMatch.sites) &&
this.tags.equals(otherArgumentMatch.tags);
}
@@ -129,16 +129,17 @@ public class ArgumentMatch implements Iterable {
* @param key Key which specifies the transform.
* @return A variant of this ArgumentMatch with all keys transformed.
*/
+ @SuppressWarnings("unchecked")
ArgumentMatch transform(Multiplexer multiplexer, Object key) {
- SortedMap> newIndices = new TreeMap>();
- for(Map.Entry> index: indices.entrySet()) {
+ SortedMap> newIndices = new TreeMap>();
+ for(Map.Entry> site: sites.entrySet()) {
List newEntries = new ArrayList();
- for(String entry: index.getValue())
+ for(String entry: site.getValue())
newEntries.add(multiplexer.transformArgument(key,entry));
- newIndices.put(index.getKey(),newEntries);
+ newIndices.put(site.getKey(),newEntries);
}
ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition);
- newArgumentMatch.indices.putAll(newIndices);
+ newArgumentMatch.sites.putAll(newIndices);
return newArgumentMatch;
}
@@ -157,9 +158,9 @@ public class ArgumentMatch implements Iterable {
public Iterator iterator() {
return new Iterator() {
/**
- * Iterate over each the available index.
+ * Iterate over each available site.
*/
- private Iterator indexIterator = null;
+ private Iterator siteIterator = null;
/**
* Iterate over each available token.
@@ -167,9 +168,9 @@ public class ArgumentMatch implements Iterable {
private Iterator tokenIterator = null;
/**
- * The next index to return. Null if none remain.
+ * The next site to return. Null if none remain.
*/
- Integer nextIndex = null;
+ ArgumentMatchSite nextSite = null;
/**
* The next token to return. Null if none remain.
@@ -177,7 +178,7 @@ public class ArgumentMatch implements Iterable {
String nextToken = null;
{
- indexIterator = indices.keySet().iterator();
+ siteIterator = sites.keySet().iterator();
prepareNext();
}
@@ -186,7 +187,7 @@ public class ArgumentMatch implements Iterable {
* @return True if there's another token waiting in the wings. False otherwise.
*/
public boolean hasNext() {
- return nextToken != null;
+ return nextToken != null;
}
/**
@@ -194,32 +195,32 @@ public class ArgumentMatch implements Iterable {
* @return The next ArgumentMatch in the series. Should never be null.
*/
public ArgumentMatch next() {
- if( nextIndex == null || nextToken == null )
+ if( nextSite == null || nextToken == null )
throw new IllegalStateException( "No more ArgumentMatches are available" );
- ArgumentMatch match = new ArgumentMatch( label, definition, nextIndex, nextToken, tags );
+ ArgumentMatch match = new ArgumentMatch( label, definition, nextSite, nextToken, tags );
prepareNext();
return match;
}
/**
* Initialize the next ArgumentMatch to return. If no ArgumentMatches are available,
- * initialize nextIndex / nextToken to null.
+ * initialize nextSite / nextToken to null.
*/
private void prepareNext() {
if( tokenIterator != null && tokenIterator.hasNext() ) {
nextToken = tokenIterator.next();
}
else {
- nextIndex = null;
+ nextSite = null;
nextToken = null;
// Do a nested loop. While more data is present in the inner loop, grab that data.
// Otherwise, troll the outer iterator looking for more data.
- while( indexIterator.hasNext() ) {
- nextIndex = indexIterator.next();
- if( indices.get(nextIndex) != null ) {
- tokenIterator = indices.get(nextIndex).iterator();
+ while( siteIterator.hasNext() ) {
+ nextSite = siteIterator.next();
+ if( sites.get(nextSite) != null ) {
+ tokenIterator = sites.get(nextSite).iterator();
if( tokenIterator.hasNext() ) {
nextToken = tokenIterator.next();
break;
@@ -245,29 +246,29 @@ public class ArgumentMatch implements Iterable {
* @param other The other match to merge into.
*/
public void mergeInto( ArgumentMatch other ) {
- indices.putAll(other.indices);
+ sites.putAll(other.sites);
}
/**
 * Associate a value with this merge mapping.
- * @param index index of the command-line argument to which this value is mated.
+ * @param site site of the command-line argument to which this value is mated.
* @param value Text representation of value to add.
*/
- public void addValue( int index, String value ) {
- if( !indices.containsKey(index) || indices.get(index) == null )
- indices.put(index, new ArrayList() );
- indices.get(index).add(value);
+ public void addValue( ArgumentMatchSite site, String value ) {
+ if( !sites.containsKey(site) || sites.get(site) == null )
+ sites.put(site, new ArrayList() );
+ sites.get(site).add(value);
}
/**
* Does this argument already have a value at the given site?
* Arguments are only allowed to be single-valued per site, and
* flags aren't allowed a value at all.
- * @param index Index at which to check for values.
+ * @param site Site at which to check for values.
* @return True if the argument has a value at the given site. False otherwise.
*/
- public boolean hasValueAtSite( int index ) {
- return (indices.get(index) != null && indices.get(index).size() >= 1) || isArgumentFlag();
+ public boolean hasValueAtSite( ArgumentMatchSite site ) {
+ return (sites.get(site) != null && sites.get(site).size() >= 1) || isArgumentFlag();
}
/**
@@ -276,9 +277,9 @@ public class ArgumentMatch implements Iterable {
*/
public List values() {
List values = new ArrayList();
- for( int index: indices.keySet() ) {
- if( indices.get(index) != null )
- values.addAll(indices.get(index));
+ for( ArgumentMatchSite site: sites.keySet() ) {
+ if( sites.get(site) != null )
+ values.addAll(sites.get(site));
}
return values;
}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java
new file mode 100644
index 000000000..8a4120101
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.commandline;
+
+/**
+ * Which source and the index within the source where an argument match was found.
+ */
+public class ArgumentMatchSite implements Comparable {
+ private final ArgumentMatchSource source;
+ private final int index;
+
+ public ArgumentMatchSite(ArgumentMatchSource source, int index) {
+ this.source = source;
+ this.index = index;
+ }
+
+ public ArgumentMatchSource getSource() {
+ return source;
+ }
+
+ public int getIndex() {
+ return index;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ ArgumentMatchSite that = (ArgumentMatchSite) o;
+
+ return (index == that.index) && (source == null ? that.source == null : source.equals(that.source));
+ }
+
+ @Override
+ public int hashCode() {
+ int result = source != null ? source.hashCode() : 0;
+ // Generated by intellij. No other special reason for this implementation. -ks
+ result = 31 * result + index;
+ return result;
+ }
+
+ @Override
+ public int compareTo(ArgumentMatchSite that) {
+ int comp = this.source.compareTo(that.source);
+ if (comp != 0)
+ return comp;
+
+ // Both sources are the same; order by index within the source.
+ if (this.index == that.index)
+ return 0;
+ return this.index < that.index ? -1 : 1;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java
new file mode 100644
index 000000000..ed2700006
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.commandline;
+
+import java.io.File;
+
+/**
+ * Where an argument match originated, via the commandline or a file.
+ */
+public class ArgumentMatchSource implements Comparable {
+ public static final ArgumentMatchSource COMMAND_LINE = new ArgumentMatchSource(ArgumentMatchSourceType.CommandLine, null);
+
+ private final ArgumentMatchSourceType type;
+ private final File file;
+
+ /**
+ * Creates an argument match source from the specified file.
+ * @param file File specifying the arguments. Must not be null.
+ */
+ public ArgumentMatchSource(File file) {
+ this(ArgumentMatchSourceType.File, file);
+ }
+
+ private ArgumentMatchSource(ArgumentMatchSourceType type, File file) {
+ if (type == ArgumentMatchSourceType.File && file == null)
+ throw new IllegalArgumentException("An argument match source of type File cannot have a null file.");
+ this.type = type;
+ this.file = file;
+ }
+
+ public ArgumentMatchSourceType getType() {
+ return type;
+ }
+
+ public File getFile() {
+ return file;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ ArgumentMatchSource that = (ArgumentMatchSource) o;
+
+ return (type == that.type) && (file == null ? that.file == null : file.equals(that.file));
+ }
+
+ @Override
+ public int hashCode() {
+ int result = type != null ? type.hashCode() : 0;
+ result = 31 * result + (file != null ? file.hashCode() : 0);
+ return result;
+ }
+
+ /**
+ * Compares two sources, putting the command line first, then files.
+ */
+ @Override
+ public int compareTo(ArgumentMatchSource that) {
+ int comp = this.type.compareTo(that.type);
+ if (comp != 0)
+ return comp;
+
+ File f1 = this.file;
+ File f2 = that.file;
+
+ if ((f1 == null) ^ (f2 == null)) {
+ // If one of the files is null and the other is not
+ // put the null file first
+ return f1 == null ? -1 : 1;
+ }
+
+ return f1 == null ? 0 : f1.compareTo(f2);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java
new file mode 100644
index 000000000..3ff6e21d4
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.commandline;
+
+/**
+ * Type of where an argument match originated, via the commandline or a file.
+ */
+public enum ArgumentMatchSourceType {
+ CommandLine, File
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java
index 52d3b8232..3da28c420 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java
@@ -37,7 +37,7 @@ public class ArgumentMatches implements Iterable {
* Collection matches from argument definition to argument value.
* Package protected access is deliberate.
*/
- Map argumentMatches = new TreeMap();
+ Map argumentMatches = new TreeMap();
/**
* Provide a place to put command-line argument values that don't seem to belong to
@@ -80,7 +80,7 @@ public class ArgumentMatches implements Iterable {
* @param site Site at which to check.
* @return True if the site has a match. False otherwise.
*/
- boolean hasMatch( int site ) {
+ boolean hasMatch( ArgumentMatchSite site ) {
return argumentMatches.containsKey( site );
}
@@ -90,7 +90,7 @@ public class ArgumentMatches implements Iterable {
* @return The match present at the given site.
* @throws IllegalArgumentException if site does not contain a match.
*/
- ArgumentMatch getMatch( int site ) {
+ ArgumentMatch getMatch( ArgumentMatchSite site ) {
if( !argumentMatches.containsKey(site) )
throw new IllegalArgumentException( "Site does not contain an argument: " + site );
return argumentMatches.get(site);
@@ -107,6 +107,7 @@ public class ArgumentMatches implements Iterable {
/**
* Return all argument matches of this source.
+ * @param parsingEngine Parsing engine.
* @param argumentSource Argument source to match.
* @return List of all matches.
*/
@@ -167,6 +168,7 @@ public class ArgumentMatches implements Iterable {
* TODO: Generify this.
* @param multiplexer Multiplexer that controls the transformation process.
* @param key Key which specifies the transform.
+ * @return new argument matches.
*/
ArgumentMatches transform(Multiplexer multiplexer, Object key) {
ArgumentMatches newArgumentMatches = new ArgumentMatches();
@@ -187,15 +189,15 @@ public class ArgumentMatches implements Iterable {
for( ArgumentMatch argumentMatch: getUniqueMatches() ) {
if( argumentMatch.definition == match.definition && argumentMatch.tags.equals(match.tags) ) {
argumentMatch.mergeInto( match );
- for( int index: match.indices.keySet() )
- argumentMatches.put( index, argumentMatch );
+ for( ArgumentMatchSite site: match.sites.keySet() )
+ argumentMatches.put( site, argumentMatch );
definitionExists = true;
}
}
if( !definitionExists ) {
- for( int index: match.indices.keySet() )
- argumentMatches.put( index, match );
+ for( ArgumentMatchSite site: match.sites.keySet() )
+ argumentMatches.put( site, match );
}
}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index d1d9cf7fe..31212a46f 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -336,6 +336,28 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
+ return parse(parsingEngine, source, type, matches, false);
+ }
+
+ /**
+ * The actual argument parsing method.
+ *
+ * IMPORTANT NOTE: the createIntervalBinding argument is a bit of a hack, but after discussions with SE we've decided
+ * that it's the best way to proceed for now. IntervalBindings can either be proper RodBindings (hence the use of
+ * this parse() method) or can be Strings (representing raw intervals or the files containing them). If createIntervalBinding
+ * is true, we do not call parsingEngine.addRodBinding() because we don't want walkers to assume that these are the
+ * usual set of RodBindings. It also allows us in the future to be smart about tagging rods as intervals. One other
+ * side point is that we want to continue to allow the usage of non-Feature intervals so that users can theoretically
+ * continue to input them out of order (whereas Tribble Features are ordered).
+ *
+ * @param parsingEngine parsing engine
+ * @param source source
+ * @param type type to check
+ * @param matches matches
+ * @param createIntervalBinding should we attempt to create an IntervalBinding instead of a RodBinding?
+ * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding.
+ */
+ public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches, boolean createIntervalBinding) {
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
String value = getArgumentValue( defaultDefinition, matches );
Class extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
@@ -348,7 +370,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
if ( tags.getPositionalTags().size() > 2 ) {
throw new UserException.CommandLineException(
String.format("Unexpected number of positional tags for argument %s : %s. " +
- "Rod bindings only suport -X:type and -X:name,type argument styles",
+ "Rod bindings only support -X:type and -X:name,type argument styles",
value, source.field.getName()));
} if ( tags.getPositionalTags().size() == 2 ) {
// -X:name,type style
@@ -378,7 +400,12 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
}
- if ( tribbleType == null )
+ if ( tribbleType == null ) {
+ // IntervalBindings allow streaming conversion of Strings
+ if ( createIntervalBinding ) {
+ return new IntervalBinding(value);
+ }
+
if ( ! file.exists() ) {
throw new UserException.CouldNotReadInputFile(file, "file does not exist");
} else if ( ! file.canRead() || ! file.isFile() ) {
@@ -389,13 +416,20 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
"Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
manager.userFriendlyListOfAvailableFeatures(parameterType)));
}
+ }
}
}
Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
- RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
- parsingEngine.addTags(result,tags);
- parsingEngine.addRodBinding(result);
+ Object result;
+ if ( createIntervalBinding ) {
+ result = ctor.newInstance(parameterType, name, value, tribbleType, tags);
+ } else {
+ RodBinding rbind = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
+ parsingEngine.addTags(rbind, tags);
+ parsingEngine.addRodBinding(rbind);
+ result = rbind;
+ }
return result;
} catch (InvocationTargetException e) {
throw new UserException.CommandLineException(
@@ -409,6 +443,39 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
}
+/**
+ * Parser for IntervalBinding objects
+ */
+class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+ /**
+ * We only want IntervalBinding class objects
+ * @param type The type to check.
+ * @return true if the provided class is an IntervalBinding
+ */
+ @Override
+ public boolean supports( Class type ) {
+ return isIntervalBinding(type);
+ }
+
+ public static boolean isIntervalBinding( Class type ) {
+ return IntervalBinding.class.isAssignableFrom(type);
+ }
+
+ /**
+ * See note from RodBindingArgumentTypeDescriptor.parse().
+ *
+ * @param parsingEngine parsing engine
+ * @param source source
+ * @param type type to check
+ * @param matches matches
+ * @return the IntervalBinding object.
+ */
+ @Override
+ public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
+ return new RodBindingArgumentTypeDescriptor().parse(parsingEngine, source, type, matches, true);
+ }
+}
+
/**
* Parse simple argument types: java primitives, wrapper classes, and anything that has
* a simple String constructor.
@@ -416,7 +483,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public boolean supports( Class type ) {
- if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) ) return false;
+ if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) || IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ) return false;
if ( type.isPrimitive() ) return true;
if ( type.isEnum() ) return true;
if ( primitiveToWrapperMap.containsValue(type) ) return true;
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
index d88e7030e..bed1e710e 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
@@ -35,10 +35,7 @@ import org.broadinstitute.sting.utils.help.ApplicationDetails;
import org.broadinstitute.sting.utils.help.HelpFormatter;
import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.Locale;
+import java.util.*;
public abstract class CommandLineProgram {
@@ -155,6 +152,7 @@ public abstract class CommandLineProgram {
*
* @param clp the command line program to execute
* @param args the command line arguments passed in
+ * @param dryRun if true, parse the arguments but skip validation and execution
* @throws Exception when an exception occurs
*/
@SuppressWarnings("unchecked")
@@ -176,6 +174,8 @@ public abstract class CommandLineProgram {
ParsingEngine parser = clp.parser = new ParsingEngine(clp);
parser.addArgumentSource(clp.getClass());
+ Map> parsedArgs;
+
// process the args
if (clp.canAddArgumentsDynamically()) {
// if the command-line program can toss in extra args, fetch them and reparse the arguments.
@@ -196,14 +196,14 @@ public abstract class CommandLineProgram {
Class[] argumentSources = clp.getArgumentSources();
for (Class argumentSource : argumentSources)
parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource);
- parser.parse(args);
+ parsedArgs = parser.parse(args);
if (isHelpPresent(parser))
printHelpAndExit(clp, parser);
if ( ! dryRun ) parser.validate();
} else {
- parser.parse(args);
+ parsedArgs = parser.parse(args);
if ( ! dryRun ) {
if (isHelpPresent(parser))
@@ -230,7 +230,7 @@ public abstract class CommandLineProgram {
}
// regardless of what happens next, generate the header information
- HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), args);
+ HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs);
// call the execute
CommandLineProgram.result = clp.execute();
diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
new file mode 100644
index 000000000..86ca6c2df
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.commandline;
+
+import com.google.java.contract.Requires;
+import org.broad.tribble.Feature;
+import org.broad.tribble.FeatureCodec;
+import org.broad.tribble.readers.AsciiLineReader;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
+import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.interval.IntervalUtils;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string.
+ *
+ * The IntervalBinding is a formal GATK argument that bridges between a walker and
+ * the engine to construct intervals for traversal at runtime. The IntervalBinding can
+ * either be a RodBinding, a string of one or more intervals, or a file with interval strings.
+ * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
+ *
+ * Note that this class is immutable.
+ */
+public final class IntervalBinding {
+
+ private RodBinding featureIntervals;
+ private String stringIntervals;
+
+ @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"})
+ public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) {
+ featureIntervals = new RodBinding(type, rawName, source, tribbleType, tags);
+ }
+
+ @Requires({"intervalArgument != null"})
+ public IntervalBinding(String intervalArgument) {
+ stringIntervals = intervalArgument;
+ }
+
+ public String getSource() {
+ if ( featureIntervals != null )
+ return featureIntervals.getSource();
+ return stringIntervals;
+ }
+
+ public List getIntervals(GenomeAnalysisEngine toolkit) {
+ List intervals;
+
+ if ( featureIntervals != null ) {
+ intervals = new ArrayList();
+
+ //RMDTrackBuilder builder = new RMDTrackBuilder(toolkit.getReferenceDataSource().getReference().getSequenceDictionary(),
+ // toolkit.getGenomeLocParser(),
+ // toolkit.getArguments().unsafe);
+
+ // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
+
+ FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
+ if ( codec instanceof ReferenceDependentFeatureCodec )
+ ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
+ try {
+ FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
+ AsciiLineReader lineReader = new AsciiLineReader(fis);
+ codec.readHeader(lineReader);
+ String line = lineReader.readLine();
+ while ( line != null ) {
+ intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(codec.decodeLoc(line)));
+ line = lineReader.readLine();
+ }
+ } catch (IOException e) {
+ throw new UserException("Problem reading the interval file " + featureIntervals.getSource() + "; " + e.getMessage());
+ }
+
+ } else {
+ intervals = IntervalUtils.parseIntervalArguments(toolkit.getGenomeLocParser(), stringIntervals);
+ }
+
+ return intervals;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
index fbf8c6516..0fac195e1 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
@@ -26,6 +26,7 @@
package org.broadinstitute.sting.commandline;
import com.google.java.contract.Requires;
+import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.JVMUtils;
@@ -35,6 +36,8 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.ApplicationDetails;
import org.broadinstitute.sting.utils.help.HelpFormatter;
+import java.io.File;
+import java.io.IOException;
import java.lang.reflect.Field;
import java.util.*;
@@ -75,6 +78,7 @@ public class ParsingEngine {
* The type of set used must be ordered (but not necessarily sorted).
*/
private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(),
+ new IntervalBindingArgumentTypeDescriptor(),
new RodBindingArgumentTypeDescriptor(),
new CompoundArgumentTypeDescriptor(),
new MultiplexArgumentTypeDescriptor()) );
@@ -100,6 +104,8 @@ public class ParsingEngine {
if(clp != null)
argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors());
argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS);
+
+ addArgumentSource(ParsingEngineArgumentFiles.class);
}
/**
@@ -148,21 +154,43 @@ public class ParsingEngine {
* command-line arguments to the arguments that are actually
* required.
* @param tokens Tokens passed on the command line.
+ * @return The parsed arguments, grouped by match source (command line or argument file).
*/
- public void parse( String[] tokens ) {
+ public SortedMap> parse( String[] tokens ) {
argumentMatches = new ArgumentMatches();
+ SortedMap> parsedArgs = new TreeMap>();
- int lastArgumentMatchSite = -1;
+ List cmdLineTokens = Arrays.asList(tokens);
+ parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs);
- for( int i = 0; i < tokens.length; i++ ) {
- String token = tokens[i];
+ ParsingEngineArgumentFiles argumentFiles = new ParsingEngineArgumentFiles();
+
+ // Load the arguments ONLY into the argument files.
+ // Validation may optionally run on the rest of the arguments.
+ loadArgumentsIntoObject(argumentFiles);
+
+ for (File file: argumentFiles.files) {
+ List fileTokens = getArguments(file);
+ parse(new ArgumentMatchSource(file), fileTokens, argumentMatches, parsedArgs);
+ }
+
+ return parsedArgs;
+ }
+
+ private void parse(ArgumentMatchSource matchSource, List tokens,
+ ArgumentMatches argumentMatches, SortedMap> parsedArgs) {
+ ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1);
+
+ int i = 0;
+ for (String token: tokens) {
// If the token is of argument form, parse it into its own argument match.
// Otherwise, pair it with the most recently used argument discovered.
+ ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i);
if( isArgumentForm(token) ) {
- ArgumentMatch argumentMatch = parseArgument( token, i );
+ ArgumentMatch argumentMatch = parseArgument( token, site );
if( argumentMatch != null ) {
argumentMatches.mergeInto( argumentMatch );
- lastArgumentMatchSite = i;
+ lastArgumentMatchSite = site;
}
}
else {
@@ -170,10 +198,31 @@ public class ParsingEngine {
!argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite))
argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, token );
else
- argumentMatches.MissingArgument.addValue( i, token );
+ argumentMatches.MissingArgument.addValue( site, token );
}
+ i++;
}
+
+ parsedArgs.put(matchSource, tokens);
+ }
+
+ private List getArguments(File file) {
+ try {
+ if (file.getAbsolutePath().endsWith(".list")) {
+ return getListArguments(file);
+ }
+ } catch (IOException e) {
+ throw new UserException.CouldNotReadInputFile(file, e);
+ }
+ throw new UserException.CouldNotReadInputFile(file, "file extension is not .list");
+ }
+
+ private List getListArguments(File file) throws IOException {
+ ArrayList argsList = new ArrayList();
+ for (String line: FileUtils.readLines(file))
+ argsList.addAll(Arrays.asList(Utils.escapeExpressions(line)));
+ return argsList;
}
public enum ValidationType { MissingRequiredArgument,
@@ -494,7 +543,7 @@ public class ParsingEngine {
* @param position The position of the token in question.
* @return ArgumentMatch associated with this token, or null if no match exists.
*/
- private ArgumentMatch parseArgument( String token, int position ) {
+ private ArgumentMatch parseArgument( String token, ArgumentMatchSite position ) {
if( !isArgumentForm(token) )
throw new IllegalArgumentException( "Token is not recognizable as an argument: " + token );
@@ -579,9 +628,21 @@ class UnmatchedArgumentException extends ArgumentException {
private static String formatArguments( ArgumentMatch invalidValues ) {
StringBuilder sb = new StringBuilder();
- for( int index: invalidValues.indices.keySet() )
- for( String value: invalidValues.indices.get(index) ) {
- sb.append( String.format("%nInvalid argument value '%s' at position %d.", value, index) );
+ for( ArgumentMatchSite site: invalidValues.sites.keySet() )
+ for( String value: invalidValues.sites.get(site) ) {
+ switch (site.getSource().getType()) {
+ case CommandLine:
+ sb.append( String.format("%nInvalid argument value '%s' at position %d.",
+ value, site.getIndex()) );
+ break;
+ case File:
+ sb.append( String.format("%nInvalid argument value '%s' in file %s at position %d.",
+ value, site.getSource().getFile().getAbsolutePath(), site.getIndex()) );
+ break;
+ default:
+ throw new RuntimeException( String.format("Unexpected argument match source type: %s",
+ site.getSource().getType()));
+ }
if(value != null && Utils.dupString(' ',value.length()).equals(value))
sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace.");
}
@@ -634,4 +695,13 @@ class UnknownEnumeratedValueException extends ArgumentException {
private static String formatArguments(ArgumentDefinition definition, String argumentPassed) {
return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions));
}
-}
\ No newline at end of file
+}
+
+/**
+ * Container class to store the list of argument files.
+ * The files will be parsed after the command line arguments.
+ */
+class ParsingEngineArgumentFiles {
+ @Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
+ public List files = new ArrayList();
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java
index a070cb5a1..452309e89 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java
@@ -68,7 +68,7 @@ public abstract class ParsingMethod {
* @return An argument match. Definition field will be populated if a match was found or
* empty if no appropriate definition could be found.
*/
- public ArgumentMatch match( ArgumentDefinitions definitions, String token, int position ) {
+ public ArgumentMatch match( ArgumentDefinitions definitions, String token, ArgumentMatchSite position ) {
// If the argument is valid, parse out the argument.
Matcher matcher = pattern.matcher(token);
@@ -102,9 +102,7 @@ public abstract class ParsingMethod {
// Try to find a matching argument. If found, label that as the match. If not found, add the argument
// with a null definition.
- ArgumentMatch argumentMatch = new ArgumentMatch(argument,argumentDefinition,position,tags);
-
- return argumentMatch;
+ return new ArgumentMatch(argument,argumentDefinition,position,tags);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index 5b9ebd99b..f8e87aa58 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -28,34 +28,30 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
+import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
-import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
+import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
-import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
-import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
+import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.GenomeLocSortedSet;
-import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
+import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.util.*;
@@ -92,7 +88,7 @@ public class GenomeAnalysisEngine {
/**
* Accessor for sample metadata
*/
- private SampleDataSource sampleDataSource = null;
+ private SampleDB sampleDB = null;
/**
* Accessor for sharded reference-ordered data.
@@ -206,6 +202,9 @@ public class GenomeAnalysisEngine {
// Prepare the data for traversal.
initializeDataSources();
+ // initialize sampleDB
+ initializeSampleDB();
+
// initialize and validate the interval list
initializeIntervals();
validateSuppliedIntervals();
@@ -222,12 +221,12 @@ public class GenomeAnalysisEngine {
ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
// execute the microscheduler, storing the results
- Object result = microScheduler.execute(this.walker, shardStrategy);
+ return microScheduler.execute(this.walker, shardStrategy);
//monitor.stop();
//logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
- return result;
+ //return result;
}
/**
@@ -259,13 +258,12 @@ public class GenomeAnalysisEngine {
* @return A collection of available filters.
*/
public Collection createFilters() {
- Set filters = new HashSet();
- filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager()));
+ final List filters = WalkerManager.getReadFilters(walker,this.getFilterManager());
if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0)
filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList));
- for(String filterName: this.getArguments().readFilters)
+ for(final String filterName: this.getArguments().readFilters)
filters.add(this.getFilterManager().createByName(filterName));
- return Collections.unmodifiableSet(filters);
+ return Collections.unmodifiableList(filters);
}
/**
@@ -299,10 +297,14 @@ public class GenomeAnalysisEngine {
else if(WalkerManager.getDownsamplingMethod(walker) != null)
method = WalkerManager.getDownsamplingMethod(walker);
else
- method = argCollection.getDefaultDownsamplingMethod();
+ method = GATKArgumentCollection.getDefaultDownsamplingMethod();
return method;
}
+ protected void setDownsamplingMethod(DownsamplingMethod method) {
+ argCollection.setDownsamplingMethod(method);
+ }
+
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
@@ -381,18 +383,18 @@ public class GenomeAnalysisEngine {
// If intervals is non-null and empty at this point, it means that the list of intervals to process
// was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since
// this was very likely unintentional, the user should be informed of this. Note that this is different
- // from the case where intervals == null, which indicates either that there were no interval arguments,
- // or that -L all was specified.
+ // from the case where intervals == null, which indicates that there were no interval arguments.
if ( intervals != null && intervals.isEmpty() ) {
- throw new ArgumentException("The given combination of -L and -XL options results in an empty set. " +
- "No intervals to process.");
+ logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process.");
}
}
/**
* Get the sharding strategy given a driving data source.
*
+ * @param readsDataSource readsDataSource
* @param drivingDataSource Data on which to shard.
+ * @param intervals intervals
* @return the sharding strategy
*/
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
@@ -429,7 +431,7 @@ public class GenomeAnalysisEngine {
return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
}
- ShardStrategy shardStrategy = null;
+ ShardStrategy shardStrategy;
ShardStrategyFactory.SHATTER_STRATEGY shardType;
long SHARD_SIZE = 100000L;
@@ -438,6 +440,8 @@ public class GenomeAnalysisEngine {
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
if (intervals != null && !intervals.isEmpty()) {
+ if (readsDataSource == null)
+ throw new IllegalArgumentException("readsDataSource is null");
if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
@@ -501,7 +505,8 @@ public class GenomeAnalysisEngine {
*/
private void initializeTempDirectory() {
File tempDir = new File(System.getProperty("java.io.tmpdir"));
- tempDir.mkdirs();
+ if (!tempDir.exists() && !tempDir.mkdirs())
+ throw new UserException.BadTmpDir("Unable to create directory");
}
/**
@@ -566,34 +571,23 @@ public class GenomeAnalysisEngine {
protected void initializeIntervals() {
// return if no interval arguments at all
- if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null))
+ if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
return;
- // if '-L all' was specified, verify that it was the only -L specified and return if so.
- if(argCollection.intervals != null) {
- for(String interval: argCollection.intervals) {
- if(interval.trim().equals("all")) {
- if(argCollection.intervals.size() > 1)
- throw new UserException("'-L all' was specified along with other intervals or interval lists; the GATK cannot combine '-L all' with other intervals.");
-
- // '-L all' was specified and seems valid. Return.
- return;
- }
- }
- }
+ // Note that the use of '-L all' is no longer supported.
// if include argument isn't given, create new set of all possible intervals
- GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
+ GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
- loadIntervals(argCollection.intervals, IntervalUtils.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging)));
+ loadIntervals(argCollection.intervals, argCollection.intervalSetRule));
// if no exclude arguments, can return parseIntervalArguments directly
- if (argCollection.excludeIntervals == null)
+ if ( argCollection.excludeIntervals == null )
intervals = includeSortedSet;
- // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
+ // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
- GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, null);
+ GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, IntervalSetRule.UNION);
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given
@@ -608,47 +602,26 @@ public class GenomeAnalysisEngine {
/**
* Loads the intervals relevant to the current execution
- * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools
- * notation, or a combination of the above
- * @param rodIntervals a list of ROD intervals to add to the returned set. Can be empty or null.
+ * @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above
+ * @param rule interval merging rule
* @return A sorted, merged list of all intervals specified in this arg list.
*/
- protected GenomeLocSortedSet loadIntervals( List<String> argList, List<GenomeLoc> rodIntervals ) {
+ protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
- boolean allowEmptyIntervalList = (argCollection.unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST ||
- argCollection.unsafe == ValidationExclusion.TYPE.ALL);
+ List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
+ for ( IntervalBinding<Feature> intervalBinding : argList ) {
+ List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
- List<GenomeLoc> nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList);
- List<GenomeLoc> allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule);
+ if ( intervals.isEmpty() ) {
+ logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
+ }
+
+ allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, rule);
+ }
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging);
}
- /**
- * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
- * @return ROD intervals as GenomeLocs
- */
- private List getRODIntervals() {
- Map rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources);
- // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
- List ret = new ArrayList();
- if (rodNames != null && argCollection.RODToInterval != null) {
- String rodName = argCollection.RODToInterval;
-
- // check to make sure we have a rod of that name
- if (!rodNames.containsKey(rodName))
- throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option");
-
- for (String str : rodNames.keySet())
- if (str.equals(rodName)) {
- logger.info("Adding interval list from track (ROD) named " + rodName);
- RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str));
- ret.addAll(intervalGenerator.toGenomeLocList());
- }
- }
- return ret;
- }
-
/**
* Add additional, externally managed IO streams for inputs.
*
@@ -692,12 +665,22 @@ public class GenomeAnalysisEngine {
for (ReadFilter filter : filters)
filter.initialize(this);
- sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);
-
// set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe);
}
+ /**
+ * Entry-point function to initialize the samples database from input data and pedigree arguments
+ */
+ private void initializeSampleDB() {
+ SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType);
+ sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader());
+ sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this));
+ sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles);
+ sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings);
+ sampleDB = sampleDBBuilder.getFinalSampleDB();
+ }
+
/**
* Gets a unique identifier for the reader sourcing this read.
* @param read Read to examine.
@@ -716,106 +699,13 @@ public class GenomeAnalysisEngine {
return getReadsDataSource().getSAMFile(id);
}
- /**
- * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying
- * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
- * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names
- * found in the corresponding bam file.
- *
- * @return Sets of samples in the merged input SAM stream, grouped by readers
- */
- public List> getSamplesByReaders() {
- Collection readers = getReadsDataSource().getReaderIDs();
-
- List> sample_sets = new ArrayList>(readers.size());
-
- for (SAMReaderID r : readers) {
-
- Set samples = new HashSet(1);
- sample_sets.add(samples);
-
- for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) {
- samples.add(g.getSample());
- }
- }
-
- return sample_sets;
-
- }
-
- /**
- * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying
- * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
- * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names
- * found in the corresponding bam file.
- *
- * @return Sets of libraries present in the (merged) input SAM stream, grouped by readers
- */
- public List> getLibrariesByReaders() {
-
-
- Collection readers = getReadsDataSource().getReaderIDs();
-
- List> lib_sets = new ArrayList>(readers.size());
-
- for (SAMReaderID r : readers) {
-
- Set libs = new HashSet(2);
- lib_sets.add(libs);
-
- for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) {
- libs.add(g.getLibrary());
- }
- }
-
- return lib_sets;
-
- }
-
- /**
- * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD ****
- *
- * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying
- * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
- * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups
- * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file.
- *
- * @return sets of (merged) read group ids in order of input bams
- */
- public List> getMergedReadGroupsByReaders() {
-
-
- Collection readers = getReadsDataSource().getReaderIDs();
-
- List> rg_sets = new ArrayList>(readers.size());
-
- for (SAMReaderID r : readers) {
-
- Set groups = new HashSet(5);
- rg_sets.add(groups);
-
- for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) {
- if (getReadsDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
- // use HeaderMerger to translate original read group id from the reader into the read group id in the
- // merged stream, and save that remapped read group id to associate it with specific reader
- groups.add(getReadsDataSource().getReadGroupId(r, g.getReadGroupId()));
- } else {
- // otherwise, pass through the unmapped read groups since this is what Picard does as well
- groups.add(g.getReadGroupId());
- }
- }
- }
-
- return rg_sets;
-
- }
-
/**
* Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available).
*
* @param reads Reads data source.
* @param reference Reference data source.
* @param rods a collection of the reference ordered data tracks
+ * @param manager builder for the reference-ordered data (RMD) tracks being validated
*/
private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods, RMDTrackBuilder manager) {
if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null )
@@ -844,15 +734,22 @@ public class GenomeAnalysisEngine {
/**
* Gets a data source for the given set of reads.
*
+ * @param argCollection command-line arguments configuring the reads data source
+ * @param genomeLocParser parser used to interpret genomic locations
+ * @param refReader indexed FASTA reader for the reference sequence
* @return A data source for the given set of reads.
*/
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
DownsamplingMethod method = getDownsamplingMethod();
+ // Synchronize the method back into the collection so that it shows up when
+ // interrogating for the downsample method during command line recreation.
+ setDownsamplingMethod(method);
+
if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
- SAMDataSource dataSource = new SAMDataSource(
+ return new SAMDataSource(
samReaderIDs,
genomeLocParser,
argCollection.useOriginalBaseQualities,
@@ -868,14 +765,12 @@ public class GenomeAnalysisEngine {
refReader,
argCollection.defaultBaseQualities,
!argCollection.disableLowMemorySharding);
- return dataSource;
}
/**
* Opens a reference sequence file paired with an index. Only public for testing purposes
*
* @param refFile Handle to a reference sequence file. Non-null.
- * @return A thread-safe file wrapper.
*/
public void setReferenceDataSource(File refFile) {
this.referenceDataSource = new ReferenceDataSource(refFile);
@@ -929,6 +824,26 @@ public class GenomeAnalysisEngine {
return readsDataSource.getHeader(reader);
}
+ /**
+ * Returns an ordered list of the unmerged SAM file headers known to this engine.
+ * @return list of headers for each input SAM file, in command line order
+ */
+ public List<SAMFileHeader> getSAMFileHeaders() {
+ final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
+ for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) {
+ headers.add(getReadsDataSource().getHeader(id));
+ }
+ return headers;
+ }
+
+ /**
+ * Gets the master sequence dictionary for this GATK engine instance
+ * @return a never-null dictionary listing all of the contigs known to this engine instance
+ */
+ public SAMSequenceDictionary getMasterSequenceDictionary() {
+ return getReferenceDataSource().getReference().getSequenceDictionary();
+ }
+
/**
* Returns data source object encapsulating all essential info and handlers used to traverse
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
@@ -939,8 +854,6 @@ public class GenomeAnalysisEngine {
return this.readsDataSource;
}
-
-
/**
* Sets the collection of GATK main application arguments.
*
@@ -1027,140 +940,14 @@ public class GenomeAnalysisEngine {
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
}
- public SampleDataSource getSampleMetadata() {
- return this.sampleDataSource;
- }
+ // -------------------------------------------------------------------------------------
+ //
+ // code for working with Samples database
+ //
+ // -------------------------------------------------------------------------------------
- /**
- * Get a sample by its ID
- * If an alias is passed in, return the main sample object
- * @param id sample id
- * @return sample Object with this ID
- */
- public Sample getSampleById(String id) {
- return sampleDataSource.getSampleById(id);
- }
-
- /**
- * Get the sample for a given read group
- * Must first look up ID for read group
- * @param readGroup of sample
- * @return sample object with ID from the read group
- */
- public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
- return sampleDataSource.getSampleByReadGroup(readGroup);
- }
-
- /**
- * Get a sample for a given read
- * Must first look up read group, and then sample ID for that read group
- * @param read of sample
- * @return sample object of this read
- */
- public Sample getSampleByRead(SAMRecord read) {
- return getSampleByReadGroup(read.getReadGroup());
- }
-
- /**
- * Get number of sample objects
- * @return size of samples map
- */
- public int sampleCount() {
- return sampleDataSource.sampleCount();
- }
-
- /**
- * Return all samples with a given family ID
- * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
- * @param familyId family ID
- * @return Samples with the given family ID
- */
- public Set getFamily(String familyId) {
- return sampleDataSource.getFamily(familyId);
- }
-
- /**
- * Returns all children of a given sample
- * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
- * @param sample parent sample
- * @return children of the given sample
- */
- public Set getChildren(Sample sample) {
- return sampleDataSource.getChildren(sample);
- }
-
- /**
- * Gets all the samples
- * @return
- */
- public Collection getSamples() {
- return sampleDataSource.getSamples();
- }
-
- /**
- * Takes a list of sample names and returns their corresponding sample objects
- *
- * @param sampleNameList List of sample names
- * @return Corresponding set of samples
- */
- public Set getSamples(Collection sampleNameList) {
- return sampleDataSource.getSamples(sampleNameList);
- }
-
-
- /**
- * Returns a set of samples that have any value (which could be null) for a given property
- * @param key Property key
- * @return Set of samples with the property
- */
- public Set getSamplesWithProperty(String key) {
- return sampleDataSource.getSamplesWithProperty(key);
- }
-
- /**
- * Returns a set of samples that have a property with a certain value
- * Value must be a string for now - could add a similar method for matching any objects in the future
- *
- * @param key Property key
- * @param value String property value
- * @return Set of samples that match key and value
- */
- public Set getSamplesWithProperty(String key, String value) {
- return sampleDataSource.getSamplesWithProperty(key, value);
-
- }
-
- /**
- * Returns a set of sample objects for the sample names in a variant context
- *
- * @param context Any variant context
- * @return a set of the sample objects
- */
- public Set getSamplesByVariantContext(VariantContext context) {
- Set samples = new HashSet();
- for (String sampleName : context.getSampleNames()) {
- samples.add(sampleDataSource.getOrCreateSample(sampleName));
- }
- return samples;
- }
-
- /**
- * Returns all samples that were referenced in the SAM file
- */
- public Set getSAMFileSamples() {
- return sampleDataSource.getSAMFileSamples();
- }
-
- /**
- * Return a subcontext restricted to samples with a given property key/value
- * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
- * @param context VariantContext to filter
- * @param key property key
- * @param value property value (must be string)
- * @return subcontext
- */
- public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
- return sampleDataSource.subContextFromSampleProperty(context, key, value);
+ public SampleDB getSampleDB() {
+ return this.sampleDB;
}
public Map getApproximateCommandLineArguments(Object... argumentProviders) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java
index 7cb615f7f..ceaa30f01 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java
@@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
+import java.util.TreeMap;
/**
* Holds a bunch of basic information about the traversal.
@@ -102,8 +103,12 @@ public class ReadMetrics implements Cloneable {
counter.put(filter.getClass(), c + 1L);
}
- public Map<Class, Long> getCountsByFilter() {
- return Collections.unmodifiableMap(counter);
+ public Map<String, Long> getCountsByFilter() {
+ final TreeMap<String, Long> sortedCounts = new TreeMap<String, Long>();
+ for(Map.Entry<Class, Long> counterEntry: counter.entrySet()) {
+ sortedCounts.put(counterEntry.getKey().getSimpleName(),counterEntry.getValue());
+ }
+ return sortedCounts;
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
index 2f4dd06e2..e0c2ce72a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
@@ -29,13 +29,11 @@ package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import org.simpleframework.xml.*;
/**
* @author ebanks
* @version 1.0
*/
-@Root
public class DbsnpArgumentCollection {
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index fd39d46b0..8078a1ea4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -26,34 +26,26 @@
package org.broadinstitute.sting.gatk.arguments;
import net.sf.samtools.SAMFileReader;
+import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
+import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
-import org.simpleframework.xml.*;
-import org.simpleframework.xml.core.Persister;
-import org.simpleframework.xml.stream.Format;
-import org.simpleframework.xml.stream.HyphenStyle;
import java.io.File;
-import java.io.InputStream;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
/**
* @author aaron
* @version 1.0
*/
-@Root
public class GATKArgumentCollection {
/* our version number */
@@ -64,58 +56,58 @@ public class GATKArgumentCollection {
public GATKArgumentCollection() {
}
- @ElementMap(entry = "analysis_argument", key = "key", attribute = true, inline = true, required = false)
public Map walkerArgs = new HashMap();
// parameters and their defaults
- @ElementList(required = false)
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false)
public List samFiles = new ArrayList();
- // parameters and their defaults
- @ElementList(required = false)
- @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false)
- public List sampleFiles = new ArrayList();
-
- @Element(required = false)
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
- @Element(required = false)
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
- @ElementList(required = false)
- @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually.", required = false)
+ @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
public List readFilters = new ArrayList();
- @ElementList(required = false)
- @Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.", required = false)
- public List intervals = null;
+ /**
+ * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
+ * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
+ * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
+ */
+ @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
+ public List<IntervalBinding<Feature>> intervals = null;
- @ElementList(required = false)
- @Input(fullName = "excludeIntervals", shortName = "XL", doc = "A list of genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file.", required = false)
- public List excludeIntervals = null;
+ /**
+ * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
+ * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
+ * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
+ */
+ @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
+ public List<IntervalBinding<Feature>> excludeIntervals = null;
+
+ /**
+ * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
+ * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
+ */
+ @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
+ public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
+
+ /**
+ * Should abutting (but not overlapping) intervals be treated as separate intervals?
+ */
+ @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
+ public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
- @Element(required = false)
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
@Deprecated
@Hidden
- @ElementList(required = false)
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false)
public ArrayList RODBindings = new ArrayList();
- @Element(required = false)
- @Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false)
- public String RODToInterval = null;
-
- @Element(required = false)
- @Argument(fullName = "BTI_merge_rule", shortName = "BTIMR", doc = "Indicates the merging approach the interval parser should use to combine the BTI track with other -L options", required = false)
- public IntervalSetRule BTIMergeRule = IntervalSetRule.UNION;
-
- @Element(required = false)
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
@@ -128,22 +120,19 @@ public class GATKArgumentCollection {
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
- @Element(required = false)
- @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here.", required = false)
+ @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
public DownsampleType downsamplingType = null;
- @Element(required = false)
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
public Double downsampleFraction = null;
- @Element(required = false)
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
public Integer downsampleCoverage = null;
/**
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
- * a default downsampling mechanism, return null.
- * @return The explicitly specified downsampling mechanism, or null if none exists.
+ * a default downsampling mechanism, return the default.
+ * @return The explicitly specified downsampling mechanism, or the default if none exists.
*/
public DownsamplingMethod getDownsamplingMethod() {
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
@@ -153,16 +142,26 @@ public class GATKArgumentCollection {
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
}
+ /**
+ * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments.
+ * @param method The downsampling mechanism.
+ */
+ public void setDownsamplingMethod(DownsamplingMethod method) {
+ if (method == null)
+ throw new IllegalArgumentException("method is null");
+ downsamplingType = method.type;
+ downsampleCoverage = method.toCoverage;
+ downsampleFraction = method.toFraction;
+ }
+
// --------------------------------------------------------------------------------------------------------------
//
// BAQ arguments
//
// --------------------------------------------------------------------------------------------------------------
- @Element(required = false)
@Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false)
public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF;
- @Element(required = false)
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
public double BAQGOP = BAQ.DEFAULT_GOP;
@@ -171,7 +170,6 @@ public class GATKArgumentCollection {
// performance log arguments
//
// --------------------------------------------------------------------------------------------------------------
- @Element(required = false)
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
@@ -184,67 +182,117 @@ public class GATKArgumentCollection {
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
}
- @Element(required = false)
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
public Boolean useOriginalBaseQualities = false;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;
- @Element(required = false)
@Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false)
public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT;
- @Element(required = false)
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe;
- /** How many threads should be allocated to this analysis. */
- @Element(required = false)
- @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
+ @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
public int numberOfThreads = 1;
- /** What rule should we use when merging intervals */
- @Element(required = false)
- @Argument(fullName = "interval_merging", shortName = "im", doc = "What interval merging rule should we use.", required = false)
- public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
-
- @ElementList(required = false)
- @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false)
+ @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false)
public List readGroupBlackList = null;
// --------------------------------------------------------------------------------------------------------------
//
- // distributed GATK arguments
+ // PED (pedigree) support
//
// --------------------------------------------------------------------------------------------------------------
- @Element(required=false)
- @Argument(fullName="processingTracker",shortName="C",doc="A lockable, shared file for coordinating distributed GATK runs",required=false)
- @Hidden
- public File processingTrackerFile = null;
- @Element(required=false)
- @Argument(fullName="restartProcessingTracker",shortName="RPT",doc="Should we delete the processing tracker file at startup?",required=false)
- @Hidden
- public boolean restartProcessingTracker = false;
+ /**
+ * Reads PED file-formatted tabular text files describing meta-data about the samples being
+ * processed in the GATK.
+ *
+ *
+ *
+ * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:
+ *
+ *
+ * - Family ID
+ * - Individual ID
+ * - Paternal ID
+ * - Maternal ID
+ * - Sex (1=male; 2=female; other=unknown)
+ * - Phenotype
+ *
+ *
+ * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person.
+ * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a
+ * quantitative trait or an affection status column: GATK will automatically detect which type
+ * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).
+ *
+ * If an individual's sex is unknown, then any character other than 1 or 2 can be used.
+ *
+ * You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that
+ * line will be ignored. Therefore, do not start any family IDs with this character.
+ *
+ * Affection status should be coded:
+ *
+ *
+ * - -9 missing
+ * - 0 missing
+ * - 1 unaffected
+ * - 2 affected
+ *
+ *
+ * If any value outside of -9,0,1,2 is detected then the phenotype values are assumed
+ * to be string phenotype values rather than affection status. In this case -9 uniquely
+ * represents the missing value.
+ *
+ * Genotypes (column 7 onwards) cannot be specified to the GATK.
+ *
+ * For example, here are two individuals (one row = one person):
+ *
+ *
+ * FAM001 1 0 0 1 2
+ * FAM001 2 0 0 1 2
+ *
+ *
+ * Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to
+ * tell the GATK PED parser that the corresponding fields are missing from the ped file.
+ *
+ * Note that most GATK walkers do not use pedigree information. Walkers that require pedigree
+ * data should clearly indicate so in their arguments and will throw errors if required pedigree
+ * information is missing.
+ */
+ @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false)
+ public List pedigreeFiles = Collections.emptyList();
- @Element(required=false)
- @Argument(fullName="processingTrackerStatusFile",shortName="CSF",doc="If provided, a detailed accounting of the state of the process tracker is written to this file. For debugging, only",required=false)
- @Hidden
- public File processingTrackerStatusFile = null;
+ /**
+ * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more
+ * valid PED records (see -ped) separated by semi-colons. Each -pedString argument
+ * supports the same tags as -ped.
+ */
+ @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false)
+ public List pedigreeStrings = Collections.emptyList();
- @Element(required=false)
- @Argument(fullName="processingTrackerID",shortName="CID",doc="If provided, an integer ID (starting at 1) indicating a unique id for this process within the distributed GATK group",required=false)
- @Hidden
- public int processTrackerID = -1;
+ /**
+ * How strict should we be in parsing the PED files?
+ */
+ @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false)
+ public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT;
+
+ // --------------------------------------------------------------------------------------------------------------
+ //
+ // BAM indexing and sharding arguments
+ //
+ // --------------------------------------------------------------------------------------------------------------
- @Element(required = false)
@Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. Use at your own risk.",required=false)
@Hidden
public boolean allowIntervalsWithUnindexedBAM = false;
- @Element(required = false)
- @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality.",required=false)
+ @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
public boolean disableLowMemorySharding = false;
// --------------------------------------------------------------------------------------------------------------
@@ -253,69 +301,6 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
- /**
- * marshal the data out to a object
- *
- * @param collection the GATKArgumentCollection to load into
- * @param outputFile the file to write to
- */
- public static void marshal(GATKArgumentCollection collection, String outputFile) {
- Serializer serializer = new Persister(new Format(new HyphenStyle()));
- File result = new File(outputFile);
- try {
- serializer.write(collection, result);
- } catch (Exception e) {
- throw new ReviewedStingException("Failed to marshal the data to the file " + outputFile, e);
- }
- }
-
- /**
- * marshal the data out to a object
- *
- * @param collection the GATKArgumentCollection to load into
- * @param outputFile the stream to write to
- */
- public static void marshal(GATKArgumentCollection collection, PrintStream outputFile) {
- Serializer serializer = new Persister(new Format(new HyphenStyle()));
- try {
- serializer.write(collection, outputFile);
- } catch (Exception e) {
- throw new ReviewedStingException("Failed to marshal the data to the file " + outputFile, e);
- }
- }
-
- /**
- * unmashall the object from a configuration file
- *
- * @param filename the filename to marshal from
- */
- public static GATKArgumentCollection unmarshal(String filename) {
- Serializer serializer = new Persister(new Format(new HyphenStyle()));
- File source = new File(filename);
- try {
- GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, source);
- return example;
- } catch (Exception e) {
- throw new ReviewedStingException("Failed to marshal the data from file " + filename, e);
- }
- }
-
- /**
- * unmashall the object from a configuration file
- *
- * @param file the inputstream to marshal from
- */
- public static GATKArgumentCollection unmarshal(InputStream file) {
- Serializer serializer = new Persister(new Format(new HyphenStyle()));
- try {
- GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, file);
- return example;
- } catch (Exception e) {
- throw new ReviewedStingException("Failed to marshal the data from file " + file.toString(), e);
- }
- }
-
-
/**
* test equality between two arg collections. This function defines the statement:
* "not fun to write"
@@ -363,7 +348,7 @@ public class GATKArgumentCollection {
if (!other.referenceFile.equals(this.referenceFile)) {
return false;
}
- if (!other.intervals.equals(this.intervals)) {
+ if ((other.intervals == null && this.intervals != null) || !other.intervals.equals(this.intervals)) {
return false;
}
if (!other.excludeIntervals.equals(this.excludeIntervals)) {
@@ -386,39 +371,21 @@ public class GATKArgumentCollection {
if (other.intervalMerging != this.intervalMerging) {
return false;
}
- if ((other.RODToInterval == null && RODToInterval != null) ||
- (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
- return false;
- }
if (other.phoneHomeType != this.phoneHomeType) {
return false;
}
- if (BTIMergeRule != other.BTIMergeRule)
+ if (intervalSetRule != other.intervalSetRule)
return false;
- if ( BAQMode != other.BAQMode) return false;
+ if ( BAQMode != other.BAQMode ) return false;
if ( BAQGOP != other.BAQGOP ) return false;
if ((other.performanceLog == null && this.performanceLog != null) ||
(other.performanceLog != null && !other.performanceLog.equals(this.performanceLog)))
return false;
- if ((other.processingTrackerFile == null && this.processingTrackerFile != null) ||
- (other.processingTrackerFile != null && !other.processingTrackerFile.equals(this.processingTrackerFile)))
- return false;
-
- if ((other.processingTrackerStatusFile == null && this.processingTrackerStatusFile != null) ||
- (other.processingTrackerStatusFile != null && !other.processingTrackerStatusFile.equals(this.processingTrackerStatusFile)))
- return false;
-
- if ( restartProcessingTracker != other.restartProcessingTracker )
- return false;
-
- if ( processTrackerID != other.processTrackerID )
- return false;
-
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
return false;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java
index 654770fe7..4c0257e6a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java
@@ -28,13 +28,11 @@ package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import org.simpleframework.xml.Root;
/**
* @author ebanks
* @version 1.0
*/
-@Root
public class StandardVariantContextInputArgumentCollection {
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java
index 0d5a23f1d..577f7929a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java
@@ -37,7 +37,6 @@ public class ValidationExclusion {
public enum TYPE {
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
- ALLOW_EMPTY_INTERVAL_LIST, // allow the user to pass in an empty interval list
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java
index 17e4a0743..57416d111 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java
@@ -25,12 +25,12 @@
package org.broadinstitute.sting.gatk.contexts;
-import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.List;
@@ -130,7 +130,7 @@ public class AlignmentContext implements HasGenomeLocation {
*/
@Deprecated
//todo: unsafe and tailored for current usage only; both pileups can be null or worse, bot can be not null in theory
- public List getReads() { return ( basePileup.getReads() ); }
+ public List getReads() { return ( basePileup.getReads() ); }
/**
* Are there any reads associated with this locus?
@@ -138,7 +138,7 @@ public class AlignmentContext implements HasGenomeLocation {
* @return
*/
public boolean hasReads() {
- return basePileup != null && basePileup.size() > 0 ;
+ return basePileup != null && basePileup.getNumberOfElements() > 0 ;
}
/**
@@ -146,7 +146,7 @@ public class AlignmentContext implements HasGenomeLocation {
* @return
*/
public int size() {
- return basePileup.size();
+ return basePileup.getNumberOfElements();
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
index 1f9a7d705..4e75f3ddb 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
@@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.contexts;
import net.sf.samtools.SAMReadGroupRecord;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -76,14 +75,6 @@ public class AlignmentContextUtils {
return splitContextBySampleName(context, null);
}
- public static Map splitContextBySample(AlignmentContext context) {
- Map m = new HashMap();
- for ( Map.Entry entry : splitContextBySampleName(context, null).entrySet() ) {
- m.put(new Sample(entry.getKey()), entry.getValue());
- }
- return m;
- }
-
/**
* Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead
* of sample object.
@@ -97,11 +88,11 @@ public class AlignmentContextUtils {
GenomeLoc loc = context.getLocation();
HashMap contexts = new HashMap();
- for(String sample: context.getPileup().getSampleNames()) {
- ReadBackedPileup pileupBySample = context.getPileup().getPileupForSampleName(sample);
+ for(String sample: context.getPileup().getSamples()) {
+ ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample);
// Don't add empty pileups to the split context.
- if(pileupBySample.size() == 0)
+ if(pileupBySample.getNumberOfElements() == 0)
continue;
if(sample != null)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java
index e92599494..a6731ee18 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java
@@ -1,10 +1,10 @@
package org.broadinstitute.sting.gatk.datasources.providers;
-import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.GenomeLocusIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Collections;
import java.util.List;
@@ -132,7 +132,7 @@ public class AllLocusView extends LocusView {
* @param site Site at which to create the blank locus context.
* @return empty context.
*/
- private final static List EMPTY_PILEUP_READS = Collections.emptyList();
+ private final static List EMPTY_PILEUP_READS = Collections.emptyList();
private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList();
private AlignmentContext createEmptyLocus( GenomeLoc site ) {
return new AlignmentContext(site,new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
index ba6321121..bf5f33dc3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java
@@ -59,7 +59,7 @@ public class LowMemoryIntervalSharder implements Iterator {
*/
public FilePointer next() {
FilePointer current = wrappedIterator.next();
- while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0)
+ while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
current = current.combine(parser,wrappedIterator.next());
return current;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
index 572970349..8452aadfd 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -43,6 +43,7 @@ import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import java.io.File;
import java.lang.reflect.InvocationTargetException;
@@ -57,6 +58,8 @@ import java.util.*;
* Converts shards to SAM iterators over the specified region
*/
public class SAMDataSource {
+ final private static GATKSamRecordFactory factory = new GATKSamRecordFactory();
+
/** Backing support for reads. */
protected final ReadProperties readProperties;
@@ -235,6 +238,12 @@ public class SAMDataSource {
for(SAMFileReader reader: readers.values()) {
// Get the sort order, forcing it to coordinate if unsorted.
SAMFileHeader header = reader.getFileHeader();
+
+ if ( header.getReadGroups().isEmpty() ) {
+ throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
+ "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
+ }
+
SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate;
// Validate that all input files are sorted in the same order.
@@ -638,7 +647,9 @@ public class SAMDataSource {
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
byte defaultBaseQualities) {
- wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
+ if ( useOriginalBaseQualities || defaultBaseQualities >= 0 )
+ // only wrap if we are replacing the original qualities or using a default base quality
+ wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
@@ -750,6 +761,7 @@ public class SAMDataSource {
public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) {
for(SAMReaderID readerID: readerIDs) {
SAMFileReader reader = new SAMFileReader(readerID.samFile);
+ reader.setSAMRecordFactory(factory);
reader.enableFileSource(true);
reader.enableIndexMemoryMapping(false);
if(!enableLowMemorySharding)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java
index 24d8bc6c5..673df6dfa 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java
@@ -97,7 +97,7 @@ public class FindLargeShards extends CommandLineProgram {
// intervals
GenomeLocSortedSet intervalSortedSet = null;
if(intervals != null)
- intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals, true), IntervalMergingRule.ALL);
+ intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL);
else {
intervalSortedSet = new GenomeLocSortedSet(genomeLocParser);
for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences())
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java
deleted file mode 100644
index 433e0af40..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java
+++ /dev/null
@@ -1,30 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 12, 2010
- * Time: 2:09:16 PM
- */
-public class PropertyDefinition {
-
- String property;
-
- String[] values;
-
- public String getProperty() {
- return property;
- }
-
- public void setProperty(String property) {
- this.property = property;
- }
-
- public String[] getValues() {
- return values;
- }
-
- public void setValues(String[] values) {
- this.values = values;
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java
deleted file mode 100644
index ca8756684..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java
+++ /dev/null
@@ -1,203 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-
-import org.broadinstitute.sting.utils.exceptions.StingException;
-
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Jul 26, 2010
- * Time: 3:31:38 PM
- */
-public class Sample implements java.io.Serializable {
-
- private final String id;
-
- private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file
-
- private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file
-
- private HashMap properties = new HashMap();
-
- private HashMap relationships = new HashMap();
-
- public enum Gender {
- MALE,
- FEMALE,
- UNKNOWN
- }
-
- public Sample(String id) {
-/* if (id == null) {
- throw new StingException("Error creating sample: sample ID cannot be null");
- }*/
- this.id = id;
- }
-
- public String getId() {
- return this.id;
- }
-
- public Map getProperties() {
- return properties;
- }
-
- public void setProperties(Map properties) {
- this.properties = (HashMap) properties;
- }
-
- public Map getRelationships() {
- return Collections.unmodifiableMap(this.relationships);
- }
-
- public void setSampleFileEntry(boolean value) {
- this.hasSampleFileEntry = value;
- }
-
- public boolean hasSAMFileEntry() {
- return this.hasSAMFileEntry;
- }
-
- public void setSAMFileEntry(boolean value) {
- this.hasSAMFileEntry = value;
- }
-
- public boolean hasSampleFileEntry() {
- return this.hasSampleFileEntry;
- }
-
- /**
- * Get one property
- * @param key key of property
- * @return value of property as generic object
- */
- public Object getProperty(String key) {
- return properties.get(key);
- }
-
- /**
- * Set a property
- * If property already exists, it is overwritten
- * @param key key of property
- * @param value object to be stored in properties array
- */
- public void setProperty(String key, Object value) {
-
- if (relationships.containsKey(key)) {
- throw new StingException("The same key cannot exist as a property and a relationship");
- }
-
- if (key.equals("gender") && value.getClass() != Gender.class) {
- throw new StingException("'gender' property must be of type Sample.Gender");
- }
-
- if (key.equals("population") && value.getClass() != String.class) {
- throw new StingException("'population' property must be of type String");
- }
-
- properties.put(key, value);
- }
-
- /**
- * Get one relationship
- * @param key of relationship
- * @return Sample object that this relationship points to
- */
- public Sample getRelationship(String key) {
- return relationships.get(key);
- }
-
- /**
- * Set one relationship
- * If already set, it is overwritten
- * @param key key of the relationship
- * @param value Sample object this relationship points to
- */
- public void setRelationship(String key, Sample value) {
- if (properties.containsKey(key)) {
- throw new StingException("The same key cannot exist as a property and a relationship");
- }
- relationships.put(key, value);
- }
-
- /**
- * Get the sample's mother
- * @return sample object with relationship mother, if exists, or null
- */
- public Sample getMother() {
- return getRelationship("mother");
- }
-
- /**
- * Get the sample's father
- * @return sample object with relationship father, if exists, or null
- */
- public Sample getFather() {
- return getRelationship("father");
- }
-
- /**
- * Get gender of the sample
- * @return property of key "gender" - must be of type Gender
- */
- public Gender getGender() {
- return (Gender) properties.get("gender");
- }
-
- public String getPopulation() {
- return (String) properties.get("population");
- }
-
- public String getFamilyId() {
- return (String) properties.get("familyId");
- }
-
- /**
- * @return True if sample is male, false if female, unknown, or null
- */
- public boolean isMale() {
- return properties.get("gender") == Gender.MALE;
- }
-
- /**
- * @return True if sample is female, false if male, unknown or null
- */
- public boolean isFemale() {
- return properties.get("gender") == Gender.MALE;
- }
-
- /**
- *
- * @param key property key
- * @return true if sample has this property (even if its value is null)
- */
- public boolean hasProperty(String key) {
- return properties.containsKey(key);
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
-
- Sample sample = (Sample) o;
-
- if (hasSAMFileEntry != sample.hasSAMFileEntry) return false;
- if (hasSampleFileEntry != sample.hasSampleFileEntry) return false;
- if (id != null ? !id.equals(sample.id) : sample.id != null) return false;
- if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false;
- if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null)
- return false;
-
- return true;
- }
-
- @Override
- public int hashCode() {
- return id != null ? id.hashCode() : "".hashCode();
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java
deleted file mode 100644
index ce749cb83..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java
+++ /dev/null
@@ -1,31 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 13, 2010
- * Time: 5:13:46 PM
- */
-public class SampleAlias {
-
- String mainId;
-
- String[] otherIds;
-
- public String getMainId() {
- return mainId;
- }
-
- public void setMainId(String mainId) {
- this.mainId = mainId;
- }
-
- public String[] getOtherIds() {
- return otherIds;
- }
-
- public void setOtherIds(String[] otherIds) {
- this.otherIds = otherIds;
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java
deleted file mode 100644
index 067bf3f72..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java
+++ /dev/null
@@ -1,590 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMReadGroupRecord;
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.SampleUtils;
-import org.broadinstitute.sting.utils.exceptions.StingException;
-import org.broadinstitute.sting.utils.variantcontext.Genotype;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import org.yaml.snakeyaml.TypeDescription;
-import org.yaml.snakeyaml.Yaml;
-import org.yaml.snakeyaml.constructor.Constructor;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.*;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Jul 26, 2010
- * Time: 3:30:09 PM
- *
- * This class stores and manages sample metadata. This data is encoded in a sample file, which can be included
- * in the GATK by the "--samples" argument. This class reads and parses those files.
- *
- * Although there are a set of public methods for accessing sample data, they aren't used by walkers - they are really
- * only used by GenomeAnalysisEngine. An instance of GenomeAnalysisEngine has one SampleDataSource. When a walker
- * wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource.
- *
- */
-public class SampleDataSource {
-
- /**
- * SAMFileHeader that has been created for this analysis.
- */
- private SAMFileHeader header;
-
- /**
- * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so
- * this is stored as a HashMap.
- */
- private final HashMap samples = new HashMap();
-
- /**
- * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different
- * datasets. If this is the case, one ID is the "primary ID" and others are "aliases".
- *
- * This maps ID => primary ID for all samples ID strings - both primary IDs and aliases.
- */
- private HashMap sampleAliases = new HashMap();
-
- /**
- * While loading sample files, we must be aware of "special" properties and relationships that are always allowed
- */
- public static final String[] specialProperties = new String[] {"familyId", "population", "gender"};
- public static final String[] specialRelationships = new String[] {"mother", "father"};
-
- /**
- * Constructor takes both a SAM header and sample files because the two must be integrated.
- * @param header SAMFileHeader that has been created for this analysis
- * @param sampleFiles Sample files that were included on the command line
- */
- public SampleDataSource(SAMFileHeader header, List sampleFiles) {
- this();
- this.header = header;
- // create empty sample object for each sample referenced in the SAM header
- for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
- if (!hasSample(sampleName)) {
- Sample newSample = new Sample(sampleName);
- newSample.setSAMFileEntry(true);
- samples.put(sampleName, newSample);
- }
- }
-
- // add files consecutively
- if (sampleFiles != null) {
- for (File file : sampleFiles) {
- addFile(file);
- }
- }
- }
-
- public SampleDataSource() {
- samples.put(null, new Sample(null));
- }
-
- /**
- * Hallucinates sample objects for all the samples in the SAM file and stores them
- */
- public void addSamplesFromSAMHeader(SAMFileHeader header) {
- for (String sampleName : SampleUtils.getSAMFileSamples(header)) {
- if (!hasSample(sampleName)) {
- Sample newSample = new Sample(sampleName);
- newSample.setSAMFileEntry(true);
- samples.put(sampleName, newSample);
- }
- }
- }
-
- /**
- * Parse one sample file and integrate it with samples that are already there
- * Fail quickly if we find any errors in the file
- */
- public void addFile(File sampleFile) {
-
- BufferedReader reader;
- try {
- reader = new BufferedReader(new FileReader(sampleFile));
- }
- catch (IOException e) {
- throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e);
- }
-
- // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file
- Constructor con = new Constructor(SampleFileParser.class);
- TypeDescription desc = new TypeDescription(SampleFileParser.class);
- desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class);
- desc.putListPropertyType("sampleAliases", SampleAlias.class);
- con.addTypeDescription(desc);
- Yaml yaml = new Yaml(con);
-
- // SampleFileParser stores an object representation of a sample file - this is what we'll parse
- SampleFileParser parser;
- try {
- parser = (SampleFileParser) yaml.load(reader);
- }
- catch (Exception e) {
- throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e);
- }
-
- // check to see which validation options were built into the file
- boolean restrictProperties = parser.getAllowedProperties() != null;
- boolean restrictRelationships = parser.getAllowedRelationships() != null;
- boolean restrictPropertyValues = parser.getPropertyDefinitions() != null;
-
- // propertyValues stores the values that are allowed for a given property
- HashMap propertyValues = null;
- if (restrictPropertyValues) {
- propertyValues = new HashMap();
- for (PropertyDefinition def : parser.getPropertyDefinitions()) {
- HashSet set = new HashSet();
- for (String value : def.getValues()) {
- set.add(value);
- }
- propertyValues.put(def.getProperty(), set);
- }
- }
-
- // make sure the aliases are valid
- validateAliases(parser);
-
- // loop through each sample in the file - a SampleParser stores an object that will become a Sample
- for (SampleParser sampleParser : parser.getSamples()) {
-
- try {
- // step 1: add the sample if it doesn't already exist
- Sample sample = getSampleById(sampleParser.getId());
- if (sample == null) {
- sample = new Sample(sampleParser.getId());
- }
- addSample(sample);
- sample.setSampleFileEntry(true);
-
- // step 2: add the properties
- if (sampleParser.getProperties() != null) {
- for (String property : sampleParser.getProperties().keySet()) {
-
- // check that property is allowed
- if (restrictProperties) {
- if (!isPropertyValid(property, parser.getAllowedProperties())) {
- throw new StingException(property + " is an invalid property. It is not included in the list " +
- "of allowed properties.");
- }
- }
-
- // next check that the value is allowed
- if (restrictPropertyValues) {
- if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) {
- throw new StingException("The value of property '" + property + "' is invalid. " +
- "It is not included in the list of allowed values for this property.");
- }
- }
-
- // next check that there isn't already a conflicting property there
- if (sample.getProperty(property) != null &&
- sample.getProperty(property) != sampleParser.getProperties().get(property))
- {
- throw new StingException(property + " is a conflicting property!");
- }
-
- // checks are passed - now add the property!
- saveProperty(sample, property, sampleParser.getProperties().get(property));
- }
- }
-
- // step 3: add the relationships
- if (sampleParser.getRelationships() != null) {
- for (String relationship : sampleParser.getRelationships().keySet()) {
- String relativeId = sampleParser.getRelationships().get(relationship);
- if (relativeId == null) {
- throw new StingException("The relationship cannot be null");
- }
-
- // first check that it's not invalid
- if (restrictRelationships) {
- if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) {
- throw new StingException(relationship + " is an invalid relationship");
- }
- }
-
- // next check that there isn't already a conflicting property there
- if (sample.getRelationship(relationship) != null) {
- if (sample.getRelationship(relationship).getId() != sampleParser.getProperties().get(relationship)) {
- throw new StingException(relationship + " is a conflicting relationship!");
- }
- // if the relationship is already set - and consistent with what we're reading now - no need to continue
- else {
- continue;
- }
- }
-
- // checks are passed - now save the relationship
- saveRelationship(sample, relationship, relativeId);
- }
- }
- } catch (Exception e) {
- throw new StingException("An error occurred while loading this sample from the sample file: " +
- sampleParser.getId(), e);
- }
- }
-
- }
-
- private boolean isValueAllowed(String key, Object value, HashMap valuesList) {
-
- // if the property values weren't specified for this property, then any value is okay
- if (!valuesList.containsKey(key)) {
- return true;
- }
-
- // if this property has enumerated values, it must be a string
- else if (value.getClass() != String.class)
- return false;
-
- // is the value specified or not?
- else if (!valuesList.get(key).contains(value))
- return false;
-
- return true;
- }
-
- /**
- * Makes sure that the aliases are valid
- * Checks that 1) no string is used as both a main ID and an alias;
- * 2) no alias is used more than once
- * @param parser
- */
- private void validateAliases(SampleFileParser parser) {
-
- // no aliases sure validate
- if (parser.getSampleAliases() == null)
- return;
-
- HashSet mainIds = new HashSet();
- HashSet otherIds = new HashSet();
-
- for (SampleAlias sampleAlias : parser.getSampleAliases()) {
- mainIds.add(sampleAlias.getMainId());
- for (String otherId : sampleAlias.getOtherIds()) {
- if (mainIds.contains(otherId))
- throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " +
- "be both a main ID and an other ID", otherId));
-
- if (!otherIds.add(otherId))
- throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " +
- "alias more than once.", otherId));
- }
- }
- }
-
- private boolean isPropertyValid(String property, String[] allowedProperties) {
-
- // is it a special property that is always allowed?
- for (String allowedProperty : specialProperties) {
- if (property.equals(allowedProperty))
- return true;
- }
-
- // is it in the allowed properties list?
- for (String allowedProperty : allowedProperties) {
- if (property.equals(allowedProperty))
- return true;
- }
-
- return false;
- }
-
- private boolean isRelationshipValid(String relationship, String[] allowedRelationships) {
-
- // is it a special relationship that is always allowed?
- for (String allowedRelationship : specialRelationships) {
- if (relationship.equals(allowedRelationship))
- return true;
- }
-
- // is it in the allowed properties list?
- for (String allowedRelationship : allowedRelationships) {
- if (relationship.equals(allowedRelationship))
- return true;
- }
-
- return false;
- }
-
- /**
- * Saves a property as the correct type
- * @param key property key
- * @param value property value, as read from YAML parser
- * @return property value to be stored
- */
- private void saveProperty(Sample sample, String key, Object value) {
-
- // convert gender to the right type, if it was stored as a String
- if (key.equals("gender")) {
- if (((String) value).toLowerCase().equals("male")) {
- value = Sample.Gender.MALE;
- }
- else if (((String) value).toLowerCase().equals("female")) {
- value = Sample.Gender.FEMALE;
- }
- else if (((String) value).toLowerCase().equals("unknown")) {
- value = Sample.Gender.UNKNOWN;
- }
- else if (value != null) {
- throw new StingException("'gender' property must be male, female, or unknown.");
- }
- }
- try {
- sample.setProperty(key, value);
- }
- catch (Exception e) {
- throw new StingException("Could not save property " + key, e);
- }
- }
-
- /**
- * Saves a relationship as the correct type
- * @param key relationship key
- * @param relativeId sample ID string of the relative
- * @return relationship value to be stored
- */
- private void saveRelationship(Sample sample, String key, String relativeId) {
-
- // get the reference that we'll store as the value
- Sample relative = getSampleById(relativeId);
-
- // create sample object for the relative, if necessary
- if (relative == null) {
- relative = new Sample(relativeId);
- addSample(relative);
- }
- sample.setRelationship(key, relative);
- }
-
-
-
- /**
- * Filter a sample name in case it is an alias
- * @param sampleId to be filtered
- * @return ID of sample that stores data for this alias
- */
- private String aliasFilter(String sampleId) {
- if (!sampleAliases.containsKey(sampleId))
- return sampleId;
- else
- return sampleAliases.get(sampleId);
- }
-
- /**
- * Add a sample to the collection
- * @param sample to be added
- */
- private void addSample(Sample sample) {
- samples.put(sample.getId(), sample);
- }
-
- /**
- * Check if sample with this ID exists
- * Note that this will return true if name passed in is an alias
- * @param id ID of sample to be checked
- * @return true if sample exists; false if not
- */
- public boolean hasSample(String id) {
- return samples.get(aliasFilter(id)) != null;
- }
-
- /**
- * Get a sample by its ID
- * If an alias is passed in, return the main sample object
- * @param id
- * @return sample Object with this ID
- */
- public Sample getSampleById(String id) {
- return samples.get(aliasFilter(id));
- }
-
- /**
- * Get the sample for a given read group
- * Must first look up ID for read group
- * @param readGroup of sample
- * @return sample object with ID from the read group
- */
- public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) {
- String nameFromReadGroup = readGroup.getSample();
- return getSampleById(nameFromReadGroup);
- }
-
- /**
- * Get a sample for a given read
- * Must first look up read group, and then sample ID for that read group
- * @param read of sample
- * @return sample object of this read
- */
- public Sample getSampleByRead(SAMRecord read) {
- return getSampleByReadGroup(read.getReadGroup());
- }
-
- /**
- * Get number of sample objects
- * @return size of samples map
- */
- public int sampleCount() {
- return samples.size();
- }
-
- /**
- * Return all samples with a given family ID
- * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this
- * @param familyId
- * @return
- */
- public Set getFamily(String familyId) {
- HashSet familyMembers = new HashSet();
-
- for (Sample sample : samples.values()) {
- if (sample.getFamilyId() != null) {
- if (sample.getFamilyId().equals(familyId))
- familyMembers.add(sample);
- }
- }
- return familyMembers;
- }
-
- /**
- * Returns all children of a given sample
- * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient
- * @param sample
- * @return
- */
- public Set getChildren(Sample sample) {
- HashSet children = new HashSet();
- for (Sample familyMember : getFamily(sample.getFamilyId())) {
- if (familyMember.getMother() == sample || familyMember.getFather() == sample) {
- children.add(familyMember);
- }
- }
- return children;
- }
-
- public Set getSamples() {
- HashSet set = new HashSet();
- set.addAll(samples.values());
- return set;
- }
-
- /**
- * Takes a collection of sample names and returns their corresponding sample objects
- * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set
- * @param sampleNameList Set of sample names
- * @return Corresponding set of samples
- */
- public Set getSamples(Collection sampleNameList) {
- HashSet samples = new HashSet();
- for (String name : sampleNameList) {
- try {
- samples.add(getSampleById(name));
- }
- catch (Exception e) {
- throw new StingException("Could not get sample with the following ID: " + name, e);
- }
- }
- return samples;
- }
-
- /**
- * Returns a set of samples that have any value (which could be null) for a given property
- * @param key Property key
- * @return Set of samples with the property
- */
- public Set getSamplesWithProperty(String key) {
- HashSet toReturn = new HashSet();
- for (Sample s : samples.values()) {
- if (s.hasProperty(key))
- toReturn.add(s);
- }
- return toReturn;
- }
-
- /**
- * Returns a set of samples that have a property with a certain value
- * Value must be a string for now - could add a similar method for matching any objects in the future
- *
- * @param key Property key
- * @param value String property value
- * @return Set of samples that match key and value
- */
- public Set getSamplesWithProperty(String key, String value) {
- Set toReturn = getSamplesWithProperty(key);
- for (Sample s : toReturn) {
- if (!s.getProperty(key).equals(value))
- toReturn.remove(s);
- }
- return toReturn;
- }
-
- public Sample getOrCreateSample(String id) {
- Sample sample = getSampleById(id);
- if (sample == null) {
- sample = new Sample(id);
- addSample(sample);
- }
- return sample;
- }
-
- /**
- * Returns all samples that were referenced in the SAM file
- */
- public Set getSAMFileSamples() {
- Set toReturn = new HashSet();
- for (Sample sample : samples.values()) {
- if (sample.hasSAMFileEntry())
- toReturn.add(sample);
- }
- return toReturn;
- }
-
- /**
- * Returns a set of sample objects for the sample names in a variant context
- *
- * @param context Any variant context
- * @return a set of the sample objects
- */
- public Set getSamplesByVariantContext(VariantContext context) {
- Set samples = new HashSet();
- for (String sampleName : context.getSampleNames()) {
- samples.add(getOrCreateSample(sampleName));
- }
- return samples;
- }
-
-
- /**
- * Return a subcontext restricted to samples with a given property key/value
- * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
- * @param context VariantContext to filter
- * @param key property key
- * @param value property value (must be string)
- * @return subcontext
- */
- public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
-
- Set samplesWithProperty = new HashSet();
- for (String sampleName : context.getSampleNames()) {
- Sample s = samples.get(sampleName);
- if (s != null && s.hasProperty(key) && s.getProperty(key).equals(value))
- samplesWithProperty.add(sampleName);
- }
- Map genotypes = context.getGenotypes(samplesWithProperty);
- return context.subContextFromGenotypes(genotypes.values());
- }
-
- public static SampleDataSource createEmptyDataSource() {
- SAMFileHeader header = new SAMFileHeader();
- return new SampleDataSource(header, null);
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java
deleted file mode 100644
index a362af663..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java
+++ /dev/null
@@ -1,65 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 12, 2010
- * Time: 1:30:44 PM
- */
-public class SampleFileParser {
-
- private SampleAlias[] sampleAliases;
-
- private String[] allowedProperties;
-
- private String[] allowedRelationships;
-
- private PropertyDefinition[] propertyDefinitions;
-
- private SampleParser[] samples;
-
- public PropertyDefinition[] getPropertyDefinitions() {
- return propertyDefinitions;
- }
-
- public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) {
- this.propertyDefinitions = propertyDefinitions;
- }
-
- public SampleFileParser() {
-
- }
-
- public String[] getAllowedProperties() {
- return allowedProperties;
- }
-
- public void setAllowedProperties(String[] allowedProperties) {
- this.allowedProperties = allowedProperties;
- }
-
- public SampleParser[] getSamples() {
- return samples;
- }
-
- public void setSamples(SampleParser[] samples) {
- this.samples = samples;
- }
-
- public String[] getAllowedRelationships() {
- return allowedRelationships;
- }
-
- public void setAllowedRelationships(String[] allowedRelationships) {
- this.allowedRelationships = allowedRelationships;
- }
-
- public SampleAlias[] getSampleAliases() {
- return sampleAliases;
- }
-
- public void setSampleAliases(SampleAlias[] sampleAliases) {
- this.sampleAliases = sampleAliases;
- }
-
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java
deleted file mode 100644
index f5e07ca29..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package org.broadinstitute.sting.gatk.datasources.sample;
-
-import java.util.HashMap;
-
-/**
- * Created by IntelliJ IDEA.
- * User: brett
- * Date: Aug 13, 2010
- * Time: 2:09:43 PM
- */
-public class SampleParser {
-
- private String id;
-
- private HashMap properties;
-
- private HashMap relationships;
-
- public String getId() {
- return id;
- }
-
- public void setId(String id) {
- this.id = id;
- }
-
- public HashMap getProperties() {
- return properties;
- }
-
- public void setProperties(HashMap properties) {
- this.properties = properties;
- }
-
- public HashMap getRelationships() {
- return relationships;
- }
-
- public void setRelationships(HashMap relationships) {
- this.relationships = relationships;
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
index ae98874c1..162baed00 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
@@ -85,12 +85,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
*/
protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse ) {
super(engine, walker, reads, reference, rods);
-
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
-
- if (engine.getArguments().processingTrackerFile != null) {
- throw new UserException.BadArgumentValue("-C", "Distributed GATK calculations currently not supported in multi-threaded mode. Complain to Mark depristo@broadinstitute.org to implement and test this code path");
- }
}
public Object execute( Walker walker, ShardStrategy shardStrategy ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index 09ab4bd44..deafcd0cc 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.SampleUtils;
import java.util.Collection;
@@ -56,7 +57,8 @@ public class LinearMicroScheduler extends MicroScheduler {
traversalEngine.startTimersIfNecessary();
if(shard.getShardType() == Shard.ShardType.LOCUS) {
LocusWalker lWalker = (LocusWalker)walker;
- WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata());
+ WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
+ getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods);
Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit());
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
index 2b6488ada..badd39860 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
@@ -62,16 +62,17 @@ public class ShardTraverser implements Callable {
Object accumulator = walker.reduceInit();
LocusWalker lWalker = (LocusWalker)walker;
- WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleMetadata()); // todo: microScheduler.engine is protected - is it okay to user it here?
- ShardDataProvider dataProvider = null;
+ WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),
+ microScheduler.getReadIterator(shard),
+ shard.getGenomeLocs(),
+ microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here?
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
- dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods);
+ final ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods);
accumulator = traversalEngine.traverse( walker, dataProvider, accumulator );
dataProvider.close();
}
- if (dataProvider != null) dataProvider.close();
windowMaker.close();
outputMergeTask = outputTracker.closeStorage();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
index 43ea46002..d1f5d80da 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
@@ -4,7 +4,6 @@ import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
@@ -12,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
@@ -63,17 +63,20 @@ public class WindowMaker implements Iterable, I
* the given intervals.
* @param iterator The data source for this window.
* @param intervals The set of intervals over which to traverse.
- * @param sampleData SampleDataSource that we can reference reads with
+ * @param sampleNames The complete set of sample names in the reads in shard
*/
- public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, SampleDataSource sampleData ) {
+ public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) {
this.sourceInfo = shard.getReadProperties();
this.readIterator = iterator;
-
- this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData));
+ this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null;
}
+ public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) {
+ this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
+ }
+
public Iterator iterator() {
return this;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java
index 74deace9a..11bbf9e4c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java
@@ -27,7 +27,9 @@ package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord;
+import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Filter out malformed reads.
@@ -37,14 +39,25 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
*/
public class MalformedReadFilter extends ReadFilter {
private SAMFileHeader header;
-
+
+ @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false)
+ boolean filterMismatchingBaseAndQuals = false;
+
@Override
public void initialize(GenomeAnalysisEngine engine) {
this.header = engine.getSAMFileHeader();
}
public boolean filterOut(SAMRecord read) {
- return !checkInvalidAlignmentStart(read) ||
+ // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided
+ if (!checkMismatchingBasesAndQuals(read)) {
+ if (!filterMismatchingBaseAndQuals)
+ throw new UserException.MalformedBAM(read, "BAM file has a read with mismatching number of bases and base qualities. Offender: " + read.getReadName() +" [" + read.getReadLength() + " bases] [" +read.getBaseQualities().length +"] quals");
+ else
+ return true;
+ }
+
+ return !checkInvalidAlignmentStart(read) ||
!checkInvalidAlignmentEnd(read) ||
!checkAlignmentDisagreesWithHeader(this.header,read) ||
!checkCigarDisagreesWithAlignment(read);
@@ -108,4 +121,13 @@ public class MalformedReadFilter extends ReadFilter {
return false;
return true;
}
+
+ /**
+ * Check if the read has the same number of bases and base qualities
+ * @param read the read to validate
+ * @return true if they have the same number. False otherwise.
+ */
+ private static boolean checkMismatchingBasesAndQuals(SAMRecord read) {
+ return (read.getReadLength() == read.getBaseQualities().length);
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java
new file mode 100755
index 000000000..a56af56d1
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java
@@ -0,0 +1,23 @@
+package org.broadinstitute.sting.gatk.filters;
+
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.commandline.Argument;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: chartl
+ * Date: 9/19/11
+ * Time: 4:09 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class ReadNameFilter extends ReadFilter {
+ @Argument(fullName = "readName", shortName = "rn", doc="Filter out all reads except those with this read name", required=true)
+ private String readName;
+
+ public boolean filterOut(final SAMRecord rec) {
+ return ! rec.getReadName().equals(readName);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java
index ebb4cbe66..4ca7b935f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java
@@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage, VCFWriter {
else if ( stub.getOutputStream() != null ) {
this.file = null;
this.stream = stub.getOutputStream();
- writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes());
+ writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
}
else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
@@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage, VCFWriter {
}
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
- return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
+ return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java
index 936243f9d..82cb43634 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java
@@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.io.stubs;
+import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub, VCFWriter {
return isCompressed;
}
+ /**
+ * Gets the master sequence dictionary from the engine associated with this stub.
+ * @see GenomeAnalysisEngine#getMasterSequenceDictionary()
+ * @return the master sequence dictionary for this run, as reported by the engine
+ */
+ public SAMSequenceDictionary getMasterSequenceDictionary() {
+ return engine.getMasterSequenceDictionary();
+ }
+
/**
* Should we tell the VCF writer not to write genotypes?
* @return true if the writer should not write genotypes.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
index e13c5a764..ee3ea63eb 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
@@ -35,26 +35,23 @@ import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.datasources.sample.Sample;
-import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.ReservoirDownsampler;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/** Iterator that traverses a SAM File, accumulating information on a per-locus basis */
public class LocusIteratorByState extends LocusIterator {
-// private static long discarded_bases = 0L;
-// private static long observed_bases = 0L;
-
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger.getLogger(LocusIteratorByState.class);
@@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator {
* Used to create new GenomeLocs.
*/
private final GenomeLocParser genomeLocParser;
- private final ArrayList samples;
+ private final ArrayList samples;
private final ReadStateManager readStates;
static private class SAMRecordState {
@@ -278,15 +275,27 @@ public class LocusIteratorByState extends LocusIterator {
//
// -----------------------------------------------------------------------------------------------------------------
- public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, SampleDataSource sampleData ) {
+ public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) {
this.readInfo = readInformation;
this.genomeLocParser = genomeLocParser;
+ this.samples = new ArrayList(samples);
+ this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
- // get the list of samples
- this.samples = new ArrayList(sampleData.getSamples());
-
- readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod());
-
+ // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
+ // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
+ if ( this.samples.isEmpty() && samIterator.hasNext() ) {
+ throw new IllegalArgumentException("samples list must not be empty");
+ }
+ }
+
+ /**
+ * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
+ * for the system.
+ */
+ public final static Collection sampleListForSAMWithoutReadGroups() {
+ List samples = new ArrayList();
+ samples.add(null);
+ return samples;
}
public Iterator iterator() {
@@ -303,19 +312,6 @@ public class LocusIteratorByState extends LocusIterator {
//if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
}
- public void printState() {
- for(Sample sample: samples) {
- Iterator iterator = readStates.iterator(sample);
- while(iterator.hasNext()) {
- SAMRecordState state = iterator.next();
- logger.debug(String.format("printState():"));
- SAMRecord read = state.getRead();
- int offset = state.getReadOffset();
- logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, (char)read.getReadBases()[offset], read.getCigarString()));
- }
- }
- }
-
private GenomeLoc getLocation() {
return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
}
@@ -355,14 +351,14 @@ public class LocusIteratorByState extends LocusIterator {
// In this case, the subsequent call to next() will emit the normal pileup at the current base
// and shift the position.
if (readInfo.generateExtendedEvents() && hasExtendedEvents) {
- Map fullExtendedEventPileup = new HashMap();
+ Map fullExtendedEventPileup = new HashMap();
// get current location on the reference and decrement it by 1: the indels we just stepped over
// are associated with the *previous* reference base
GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1);
boolean hasBeenSampled = false;
- for(Sample sample: samples) {
+ for(final String sample: samples) {
Iterator iterator = readStates.iterator(sample);
List