AggregateMetrics:

- Ported from Jython to Java, so it is now accessible to Queue via automatic extension generation.
- Better handling for problematic sample names by using PicardAggregationUtils.
GATKReportTable looks up keys using arrays instead of dot-separated strings, which is useful when a sample has a period in the name.
CombineVariants has option to suppress the header with the command line, which is now invoked during VCF gathering.
Added SelectHeaders walker for filtering headers for dbGAP submission.
Generated command line for read filters now correctly prefixes the argument name as --read_filter instead of -read_filter.
Latest WholeGenomePipeline.
Other minor cleanup to utility methods.
This commit is contained in:
Khalid Shakir 2012-04-17 11:45:32 -04:00
parent 1a2e92f8db
commit 91cb654791
18 changed files with 1050 additions and 227 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2009 The Broad Institute
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -12,7 +12,6 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@ -99,8 +98,13 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Create a new stub given the requested file.
*
* @param engine engine.
* @param genotypeFile file to (ultimately) create.
* @param isCompressed should we compress the output stream?
* @param argumentSources sources.
* @param skipWritingHeader skip writing header.
* @param doNotWriteGenotypes do not write genotypes.
*/
public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) {
this.engine = engine;
@ -114,8 +118,13 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Create a new stub given the requested file.
*
* @param engine engine.
* @param genotypeStream stream to (ultimately) write.
* @param isCompressed should we compress the output stream?
* @param argumentSources sources.
* @param skipWritingHeader skip writing header.
* @param doNotWriteGenotypes do not write genotypes.
*/
public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) {
this.engine = engine;
@ -154,7 +163,7 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Gets the master sequence dictionary from the engine associated with this stub
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
* @return
* @return the master sequence dictionary from the engine associated with this stub
*/
public SAMSequenceDictionary getMasterSequenceDictionary() {
return engine.getMasterSequenceDictionary();
@ -188,22 +197,25 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
vcfHeader = header;
// Check for the command-line argument header line. If not present, add it in.
if ( !skipWritingHeader ) {
VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine();
boolean foundCommandLineHeaderLine = false;
for (VCFHeaderLine line: vcfHeader.getMetaData()) {
if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) )
foundCommandLineHeaderLine = true;
if (!skipWritingHeader && header.isWriteEngineHeaders()) {
if (header.isWriteCommandLine()) {
VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine();
boolean foundCommandLineHeaderLine = false;
for (VCFHeaderLine line: vcfHeader.getMetaData()) {
if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) )
foundCommandLineHeaderLine = true;
}
if ( !foundCommandLineHeaderLine )
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
}
if ( !foundCommandLineHeaderLine )
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
// also put in the reference contig header lines
String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName());
for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() )
vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly));
vcfHeader.addMetaDataLine(new VCFHeaderLine("reference", "file://" + engine.getArguments().referenceFile.getAbsolutePath()));
vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath()));
}
outputTracker.getStorage(this).writeHeader(vcfHeader);
@ -225,7 +237,7 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Gets a string representation of this object.
* @return
* @return a string representation of this object.
*/
@Override
public String toString() {
@ -247,20 +259,20 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
val = String.format("<ID=%s,length=%d,assembly=%s>", contig.getSequenceName(), contig.getSequenceLength(), assembly);
else
val = String.format("<ID=%s,length=%d>", contig.getSequenceName(), contig.getSequenceLength());
return new VCFHeaderLine("contig", val);
return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val);
}
private String getReferenceAssembly(String refPath) {
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
String assembly = null;
if ( refPath.indexOf("b37") != -1 || refPath.indexOf("v37") != -1 )
if (refPath.contains("b37") || refPath.contains("v37"))
assembly = "b37";
else if ( refPath.indexOf("b36") != -1 )
else if (refPath.contains("b36"))
assembly = "b36";
else if ( refPath.indexOf("hg18") != -1 )
else if (refPath.contains("hg18"))
assembly = "hg18";
else if ( refPath.indexOf("hg19") != -1 )
else if (refPath.contains("hg19"))
assembly = "hg19";
return assembly;
}
}
}

View File

@ -250,53 +250,40 @@ public class GATKReportTable {
}
/**
* Returns the first primary key matching the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
*
* @param dottedColumnValues Period concatenated values.
* Returns the first primary key matching the column values.
* Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all"
* @param columnValues column values.
* @return The first primary key matching the column values or throws an exception.
*/
public Object getPrimaryKeyByData(String dottedColumnValues) {
Object key = findPrimaryKey(dottedColumnValues);
public Object getPrimaryKeyByData(Object... columnValues) {
Object key = findPrimaryKeyByData(columnValues);
if (key == null)
throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues);
throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + Arrays.asList(columnValues));
return key;
}
/**
* Returns true if there is at least on row with the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
*
* @param dottedColumnValues Period concatenated values.
* @return true if there is at least one row matching the columns.
*/
public boolean containsPrimaryKey(String dottedColumnValues) {
return findPrimaryKey(dottedColumnValues) != null;
}
/**
* Returns the first primary key matching the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
*
* @param dottedColumnValues Period concatenated values.
* @return The first primary key matching the column values or null.
*/
private Object findPrimaryKey(String dottedColumnValues) {
return findPrimaryKey(dottedColumnValues.split("\\."));
}
/**
* Returns the first primary key matching the column values.
* Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" }
* Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all"
*
* @param columnValues column values.
* @return The first primary key matching the column values.
* @return The first primary key matching the column values or null if the key does not exist.
*/
private Object findPrimaryKey(Object[] columnValues) {
public Object findPrimaryKeyByData(Object... columnValues) {
if (columnValues == null)
throw new NullPointerException("Column values is null");
if (columnValues.length == 0)
throw new IllegalArgumentException("Column values is empty");
int columnCount = columns.size();
for (Object primaryKey : primaryKeyColumn) {
boolean matching = true;
for (int i = 0; matching && i < columnValues.length; i++) {
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i + 1));
// i --> index into columnValues parameter
// j --> index into columns collection
for (int i = 0, j = 0; matching && i < columnValues.length && j < columnCount; j++) {
if (!columns.getByIndex(j).isDisplayable())
continue;
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i));
i++;
}
if (matching)
return primaryKey;
@ -360,8 +347,8 @@ public class GATKReportTable {
* output file), and the format string used to display the data.
*
* @param columnName the name of the column
* @param defaultValue the default value of a blank cell
* @param display if true - the column will be displayed; if false - the column will be hidden
* @param defaultValue the default value of a blank cell
* @param display if true - the column will be displayed; if false - the column will be hidden
* @param format the format string used to display data
*/
public void addColumn(String columnName, Object defaultValue, boolean display, String format) {

View File

@ -157,6 +157,12 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false)
public int minimumN = 1;
/**
* This option allows the suppression of the command line in the VCF header. This is most often useful when combining variants for dozens or hundreds of smaller VCFs.
*/
@Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false)
public boolean SUPPRESS_COMMAND_LINE_HEADER = false;
@Hidden
@Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false)
public boolean MERGE_INFO_WITH_MAX_AC = false;
@ -183,7 +189,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
if ( SET_KEY != null )
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples));
VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples);
vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER);
vcfWriter.writeHeader(vcfHeader);
if ( vcfWriter instanceof VCFWriterStub) {
sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes();

View File

@ -0,0 +1,250 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.apache.commons.io.FilenameUtils;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.text.ListFileUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.File;
import java.util.*;
/**
* Selects headers from a VCF source.
* <p/>
* <p>
* Often, a VCF containing many headers will need to be subset in order to facilitate certain formatting guidelines.
* SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the
* file (based on a complete header name or a pattern match).
* </p>
* <p/>
* <h2>Input</h2>
* <p>
* A single VCF.
* </p>
* <p/>
* <h2>Output</h2>
* <p>
* A header selected VCF.
* </p>
* <p/>
* <h2>Examples</h2>
* <pre>
* Select only the FILTER, FORMAT, and INFO headers:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectHeaders \
* --variant input.vcf \
* -o output.vcf \
* -hn FILTER \
* -hn FORMAT \
* -hn INFO
*
* Select only the FILTER, FORMAT, and INFO headers and add in the reference file names:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectHeaders \
* --variant input.vcf \
* -o output.vcf \
* -hn FILTER \
* -hn FORMAT \
* -hn INFO \
* -irn \
* -iln
*
* Select only the FILTER, FORMAT, and INFO headers, plus any headers with SnpEff:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectHeaders \
* --variant input.vcf \
* -o output.vcf \
* -hn FILTER \
* -hn FORMAT \
* -hn INFO \
* -he '.*SnpEff.*'
* </pre>
*/
@SuppressWarnings("unused")
public class SelectHeaders extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
    @ArgumentCollection
    protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

    @Output(doc = "File to which variants should be written", required = true)
    protected VCFWriter vcfWriter;

    @Argument(fullName = "header_name", shortName = "hn", doc = "Include header. Can be specified multiple times", required = false)
    public Set<String> headerNames;

    @Argument(fullName = "header_expression", shortName = "he", doc = "Regular expression to select many headers from the tracks provided. Can be specified multiple times", required = false)
    public Set<String> headerExpressions;

    /**
     * Note that header exclusion takes precedence over inclusion, so that if a header is in both lists it will be excluded.
     */
    @Argument(fullName = "exclude_header_name", shortName = "xl_hn", doc = "Exclude header. Can be specified multiple times", required = false)
    public Set<String> XLheaderNames;

    /**
     * Note that reference inclusion takes precedence over other header matching. If set other reference lines may be excluded but the file name will still be added.
     */
    @Argument(fullName = "include_reference_name", shortName = "irn", doc = "If set the reference file name minus the file extension will be added to the headers", required = false)
    public boolean includeReference;

    /**
     * Note that interval name inclusion takes precedence over other header matching. If set other interval lines may be excluded but the intervals will still be added.
     */
    @Argument(fullName = "include_interval_names", shortName = "iln", doc = "If set the interval file name minus the file extension, or the command line intervals, will be added to the headers", required = false)
    public boolean includeIntervals;

    /**
     * Note that engine header inclusion takes precedence over other header matching. If set other engine lines may be excluded but the engine headers will still be added.
     */
    @Hidden // TODO: Determine if others find this valuable and either remove @Hidden or remove -ieh.
    @Argument(fullName = "include_engine_headers", shortName = "ieh", doc = "If set the headers normally output by the engine will be added to the headers", required = false)
    public boolean includeEngineHeaders;

    /** Maps a header line to the key that names and expressions are matched against. */
    private static final ListFileUtils.StringConverter<VCFHeaderLine> HEADER_KEY = new ListFileUtils.StringConverter<VCFHeaderLine>() {
        @Override
        public String convert(VCFHeaderLine value) {
            return value.getKey();
        }
    };

    /**
     * Builds the output header: merges the input header lines, keeps only the selected ones,
     * optionally appends reference and interval lines, and writes the result to the VCF writer.
     */
    @Override
    public void initialize() {
        // Merge the header lines of the input rod and record this walker as the source.
        List<String> rodNames = Arrays.asList(variantCollection.variants.getName());
        Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
        Set<VCFHeaderLine> mergedLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
        mergedLines.add(new VCFHeaderLine(VCFHeader.SOURCE_KEY, "SelectHeaders"));

        // Keep only the headers requested by name or expression; the optional
        // reference and interval lines are appended after the selection.
        Set<VCFHeaderLine> headerLines = new LinkedHashSet<VCFHeaderLine>(getSelectedHeaders(mergedLines));
        addReferenceLine(headerLines);
        addIntervalLines(headerLines);

        TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
        VCFHeader vcfHeader = new VCFHeader(headerLines, vcfSamples);
        vcfHeader.setWriteEngineHeaders(includeEngineHeaders);
        vcfWriter.writeHeader(vcfHeader);
    }

    /**
     * Adds the reference file's base name as a header line when requested and a reference file is set.
     *
     * @param headerLines the header lines to append to
     */
    private void addReferenceLine(Set<VCFHeaderLine> headerLines) {
        if (!includeReference)
            return;
        File referenceFile = getToolkit().getArguments().referenceFile;
        if (referenceFile == null)
            return;
        headerLines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, FilenameUtils.getBaseName(referenceFile.getName())));
    }

    /**
     * Adds one header line per interval binding when requested. A binding whose source is an
     * existing file contributes the file's base name; any other source is added verbatim.
     *
     * @param headerLines the header lines to append to
     */
    private void addIntervalLines(Set<VCFHeaderLine> headerLines) {
        if (!includeIntervals || getToolkit().getArguments().intervals == null)
            return;
        for (IntervalBinding<Feature> binding : getToolkit().getArguments().intervals) {
            String source = binding.getSource();
            if (source != null) {
                File sourceFile = new File(source);
                String value = sourceFile.exists() ? FilenameUtils.getBaseName(sourceFile.getName()) : source;
                headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, value));
            }
        }
    }

    /**
     * Applies the include/exclude arguments to the merged header lines.
     *
     * @param headerLines all candidate header lines
     * @return every line when no include argument was given, otherwise the lines matching a
     *         name or expression; lines matching an excluded name are removed in both cases
     */
    private Set<VCFHeaderLine> getSelectedHeaders(Set<VCFHeaderLine> headerLines) {
        final boolean selectByName = headerNames != null;
        final boolean selectByExpression = headerExpressions != null;

        Set<VCFHeaderLine> selected = new TreeSet<VCFHeaderLine>();
        if (!selectByName && !selectByExpression)
            selected.addAll(headerLines);
        if (selectByName)
            selected.addAll(ListFileUtils.includeMatching(headerLines, HEADER_KEY, headerNames, true));
        if (selectByExpression)
            selected.addAll(ListFileUtils.includeMatching(headerLines, HEADER_KEY, headerExpressions, false));

        // Exclusion is applied last so that it wins over inclusion.
        if (XLheaderNames == null)
            return selected;
        return ListFileUtils.excludeMatching(selected, HEADER_KEY, XLheaderNames, true);
    }

    /**
     * Writes every variant context at the current location to the output unchanged.
     *
     * @param tracker the ROD tracker
     * @param ref     reference information
     * @param context alignment info
     * @return number of records written at this location
     */
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if (tracker == null)
            return 0;
        Collection<VariantContext> records = tracker.getValues(variantCollection.variants, context.getLocation());
        if (records == null)
            return 0;
        int written = 0;
        for (VariantContext record : records) {
            vcfWriter.add(record);
            written++;
        }
        return written;
    }

    /** @return the initial record count of zero */
    @Override
    public Integer reduceInit() {
        return 0;
    }

    /** Adds the count from one map call to the running total. */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return sum + value;
    }

    /** Combines the record counts of two partial reductions. */
    @Override
    public Integer treeReduce(Integer lhs, Integer rhs) {
        return rhs + lhs;
    }

    /** Logs the total number of records processed. */
    @Override
    public void onTraversalDone(Integer result) {
        final String message = result + " records processed.";
        logger.info(message);
    }
}

View File

@ -194,6 +194,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram {
*/
private static final List<String> gatkPackages = Arrays.asList(
"org.broadinstitute.sting.gatk",
"org.broadinstitute.sting.pipeline",
"org.broadinstitute.sting.analyzecovariates",
"org.broadinstitute.sting.gatk.datasources.reads.utilities");
@ -251,7 +252,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram {
*/
private void writeFilter(String className, List<? extends ArgumentField> argumentFields, Set<Class<?>> dependents) throws IOException {
String content = getContent(TRAIT_TEMPLATE, "org.broadinstitute.sting.queue.function.CommandLineFunction",
className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents);
className, "", false, String.format(" + \" --read_filter %s\"", className), argumentFields, dependents);
writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content);
}

View File

@ -0,0 +1,90 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.R;
import org.apache.commons.lang.StringUtils;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
/**
 * Static helpers that render Java collections as R vector literals of the form {@code c(...)}.
 */
public class RUtils {
    /** R literal emitted for a missing list or a missing numeric/date element. */
    private static final String NA = "NA";

    /**
     * Converts a collection of values to an R character vector. A null list will return NA,
     * otherwise each value is wrapped in single quotes and the values are combined with c().
     * Backslashes and single quotes inside a value are backslash-escaped so that the value
     * cannot terminate the R string literal early. A null element is written as an empty
     * string ('').
     * @param list Collection of values
     * @return The R representation of the list
     */
    public static String toStringList(Collection<? extends CharSequence> list) {
        if (list == null)
            return NA;
        StringBuilder sb = new StringBuilder("c(");
        boolean first = true;
        for (CharSequence value : list) {
            if (!first)
                sb.append(',');
            sb.append('\'');
            if (value != null)
                appendEscaped(sb, value);
            sb.append('\'');
            first = false;
        }
        return sb.append(')').toString();
    }

    /**
     * Converts a collection of values to an R numeric vector. A null list will return NA,
     * otherwise the values will be combined with c(). A null element is written as NA,
     * because an empty slot would be a syntax error in R.
     * @param list Collection of values
     * @return The R representation of the list
     */
    public static String toNumberList(Collection<? extends Number> list) {
        if (list == null)
            return NA;
        StringBuilder sb = new StringBuilder("c(");
        boolean first = true;
        for (Number value : list) {
            if (!first)
                sb.append(',');
            sb.append(value == null ? NA : value.toString());
            first = false;
        }
        return sb.append(')').toString();
    }

    /**
     * Converts a collection of dates to an R vector. A null list will return NA,
     * otherwise each date is formatted as 'yyyy-MM-dd' (including the single quotes)
     * and the dates are combined with c().
     * @param list Collection of values
     * @return The R representation of the list
     */
    public static String toDateList(Collection<? extends Date> list) {
        // A doubled single quote is a literal quote in a SimpleDateFormat pattern.
        return toDateList(list, "''yyyy-MM-dd''");
    }

    /**
     * Converts a collection of dates to an R vector, formatting each date with the pattern.
     * A null list will return NA and a null element is written as NA. The pattern is
     * responsible for any quoting of the formatted value.
     * @param list Collection of values
     * @param pattern {@link SimpleDateFormat} pattern string for each date
     * @return The R representation of the list
     */
    public static String toDateList(Collection<? extends Date> list, String pattern) {
        if (list == null)
            return NA;
        // Created per call because SimpleDateFormat is not thread-safe.
        SimpleDateFormat format = new SimpleDateFormat(pattern);
        StringBuilder sb = new StringBuilder("c(");
        boolean first = true;
        for (Date date : list) {
            if (!first)
                sb.append(',');
            sb.append(date == null ? NA : format.format(date));
            first = false;
        }
        return sb.append(')').toString();
    }

    /**
     * Appends the value with every backslash and single quote preceded by a backslash.
     * @param sb destination buffer
     * @param value non-null text to escape
     */
    private static void appendEscaped(StringBuilder sb, CharSequence value) {
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c == '\\' || c == '\'')
                sb.append('\\');
            sb.append(c);
        }
    }
}

View File

@ -31,14 +31,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.text.ListFileUtils;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@ -74,10 +73,10 @@ public class SampleUtils {
* Same as @link getSAMFileSamples but gets all of the samples
* in the SAM files loaded by the engine
*
* @param engine
* @return
* @param engine engine
* @return samples
*/
public final static Set<String> getSAMFileSamples(GenomeAnalysisEngine engine) {
public static Set<String> getSAMFileSamples(GenomeAnalysisEngine engine) {
return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader());
}
@ -209,89 +208,24 @@ public class SampleUtils {
* we try to read a file named E from disk, and if possible all lines from that file are expanded
* into unique sample names.
*
* @param sampleArgs
* @return
* @param sampleArgs args
* @return samples
*/
public static Set<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
if (sampleArgs != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
Set<String> samplesFromFiles = new HashSet<String>();
for (String SAMPLE_EXPRESSION : sampleArgs) {
File sampleFile = new File(SAMPLE_EXPRESSION);
try {
XReadLines reader = new XReadLines(sampleFile);
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line.trim());
}
} catch (FileNotFoundException e) {
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
}
}
return samplesFromFiles;
return ListFileUtils.unpackSet(sampleArgs);
}
return new HashSet<String>();
}
public static Set<String> getSamplesFromCommandLineInput(Collection<String> vcfSamples, Collection<String> sampleExpressions) {
Set<String> samples = new HashSet<String>();
if (sampleExpressions != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
Set<String> samplesFromFiles = new HashSet<String>();
for (String sampleExpression : sampleExpressions) {
File sampleFile = new File(sampleExpression);
try {
XReadLines reader = new XReadLines(sampleFile);
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line);
}
} catch (FileNotFoundException e) {
// ignore exception
}
}
sampleExpressions.addAll(samplesFromFiles);
// Let's now assume that the values in sampleExpressions are literal sample names and not regular
// expressions. Extract those samples specifically so we don't make the mistake of selecting more
// than what the user really wants.
Set<String> possibleSampleRegexs = new HashSet<String>();
for (String sampleExpression : sampleExpressions) {
if (!(new File(sampleExpression).exists())) {
if (vcfSamples.contains(sampleExpression)) {
samples.add(sampleExpression);
} else {
possibleSampleRegexs.add(sampleExpression);
}
}
}
// Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions
for (String sampleRegex : possibleSampleRegexs) {
Pattern p = Pattern.compile(sampleRegex);
for (String vcfSample : vcfSamples) {
Matcher m = p.matcher(vcfSample);
if (m.find()) {
samples.add(vcfSample);
}
}
}
Set<String> samples = ListFileUtils.unpackSet(vcfSamples);
if (sampleExpressions == null) {
return samples;
} else {
samples.addAll(vcfSamples);
return ListFileUtils.includeMatching(samples, sampleExpressions, false);
}
return samples;
}
/**
@ -304,16 +238,7 @@ public class SampleUtils {
// Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions
Set<String> samples = new HashSet<String>();
if (sampleExpressions != null) {
for (String expression : sampleExpressions) {
Pattern p = Pattern.compile(expression);
for (String originalSample : originalSamples) {
Matcher m = p.matcher(originalSample);
if (m.find()) {
samples.add(originalSample);
}
}
}
samples.addAll(ListFileUtils.includeMatching(originalSamples, sampleExpressions, false));
}
return samples;
}

View File

@ -1,5 +1,28 @@
package org.broadinstitute.sting.utils.codecs.vcf;
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.util.ParsingUtils;
@ -35,6 +58,11 @@ public class VCFHeader {
// the header string indicator
public static final String HEADER_INDICATOR = "#";
public static final String SOURCE_KEY = "source";
public static final String REFERENCE_KEY = "reference";
public static final String CONTIG_KEY = "contig";
public static final String INTERVALS_KEY = "intervals";
// were the input samples sorted originally (or are we sorting them)?
private boolean samplesWereAlreadySorted = true;
@ -42,6 +70,8 @@ public class VCFHeader {
protected ArrayList<String> sampleNamesInOrder = null;
protected HashMap<String, Integer> sampleNameToOffset = null;
private boolean writeEngineHeaders = true;
private boolean writeCommandLine = true;
/**
* create a VCF header, given a list of meta data and auxiliary tags
@ -79,6 +109,7 @@ public class VCFHeader {
* using this header (i.e., read by the VCFCodec) will have genotypes
* occurring in the same order
*
* @param genotypeSampleNamesInAppearenceOrder genotype sample names
*/
protected void buildVCFReaderMaps(List<String> genotypeSampleNamesInAppearenceOrder) {
@ -144,10 +175,7 @@ public class VCFHeader {
* @return a set of the header fields, in order
*/
public Set<HEADER_FIELDS> getHeaderFields() {
Set<HEADER_FIELDS> fields = new LinkedHashSet<HEADER_FIELDS>();
for (HEADER_FIELDS field : HEADER_FIELDS.values())
fields.add(field);
return fields;
return new LinkedHashSet<HEADER_FIELDS>(Arrays.asList(HEADER_FIELDS.values()));
}
/**
@ -217,7 +245,36 @@ public class VCFHeader {
public VCFHeaderLine getOtherHeaderLine(String key) {
// Delegates directly to mOtherMetaData.get(key); no defaulting or validation happens here.
return mOtherMetaData.get(key);
}
/**
 * Reports whether additional engine headers will be written to the VCF; when false, only the
 * walker headers will be output. The backing field is initialized to true.
 *
 * @return true if additional engine headers will be written to the VCF
 */
public boolean isWriteEngineHeaders() {
return writeEngineHeaders;
}
/**
 * Chooses whether additional engine headers will be written to the VCF; when set to false, only
 * the walker headers will be output. The value is stored as-is and returned by
 * isWriteEngineHeaders().
 *
 * @param writeEngineHeaders true if additional engine headers will be written to the VCF
 */
public void setWriteEngineHeaders(boolean writeEngineHeaders) {
this.writeEngineHeaders = writeEngineHeaders;
}
/**
 * Reports whether the command line will be written to the VCF. The command line is only written
 * when isWriteEngineHeaders() also returns true. The backing field is initialized to true.
 *
 * @return true if the command line will be written to the VCF
 */
public boolean isWriteCommandLine() {
return writeCommandLine;
}
/**
 * Chooses whether the command line will be written to the VCF. The setting only has an effect
 * when isWriteEngineHeaders() also returns true. The value is stored as-is and returned by
 * isWriteCommandLine().
 *
 * @param writeCommandLine true if the command line will be written to the VCF
 */
public void setWriteCommandLine(boolean writeCommandLine) {
this.writeCommandLine = writeCommandLine;
}
}

View File

@ -34,9 +34,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
/**
* A collection of convenience methods for working with list files.
@ -54,6 +54,7 @@ public class ListFileUtils {
* LIST_FILE_COMMENT_START are ignored.
*
* @param samFiles The sam files, in string format.
* @param parser Parser
* @return a flattened list of the bam files provided
*/
public static List<SAMReaderID> unpackBAMFileList(final List<String> samFiles, final ParsingEngine parser) {
@ -63,10 +64,8 @@ public class ListFileUtils {
inputFileName = expandFileName(inputFileName);
if (inputFileName.toLowerCase().endsWith(".list") ) {
try {
for ( String fileName : new XReadLines(new File(inputFileName), true) ) {
if ( fileName.length() > 0 && ! fileName.startsWith(LIST_FILE_COMMENT_START) ) {
unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName)));
}
for ( String fileName : new XReadLines(new File(inputFileName), true, LIST_FILE_COMMENT_START) ) {
unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName)));
}
}
catch( FileNotFoundException ex ) {
@ -91,9 +90,11 @@ public class ListFileUtils {
/**
* Convert command-line argument representation of ROD bindings to something more easily understandable by the engine.
* @param RODBindings a text equivale
* @param parser Parser
* @return a list of expanded, bound RODs.
*/
@Deprecated
@SuppressWarnings("unused") // TODO: Who is still using this? External walkers?
public static Collection<RMDTriplet> unpackRODBindingsOldStyle(final Collection<String> RODBindings, final ParsingEngine parser) {
// todo -- this is a strange home for this code. Move into ROD system
Collection<RMDTriplet> rodBindings = new ArrayList<RMDTriplet>();
@ -112,7 +113,7 @@ public class ListFileUtils {
String name = positionalTags.get(0);
String type = positionalTags.get(1);
RMDTriplet.RMDStorageType storageType = null;
RMDTriplet.RMDStorageType storageType;
if(tags.getValue("storage") != null)
storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage"));
else if(fileName.toLowerCase().endsWith("stdin"))
@ -129,9 +130,11 @@ public class ListFileUtils {
/**
* Convert command-line argument representation of ROD bindings to something more easily understandable by the engine.
* @param RODBindings a text equivale
* @param parser Parser
* @return a list of expanded, bound RODs.
*/
public static Collection<RMDTriplet> unpackRODBindings(final Collection<RodBinding> RODBindings, final ParsingEngine parser) {
@SuppressWarnings("unchecked")
public static Collection<RMDTriplet> unpackRODBindings(final Collection<RodBinding> RODBindings, @SuppressWarnings("unused") final ParsingEngine parser) {
// todo -- this is a strange home for this code. Move into ROD system
Collection<RMDTriplet> rodBindings = new ArrayList<RMDTriplet>();
FeatureManager builderForValidation = new FeatureManager();
@ -142,7 +145,7 @@ public class ListFileUtils {
String name = rodBinding.getName();
String type = rodBinding.getTribbleType();
RMDTriplet.RMDStorageType storageType = null;
RMDTriplet.RMDStorageType storageType;
if(rodBinding.getTags().getValue("storage") != null)
storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage"));
else if(fileName.toLowerCase().endsWith("stdin"))
@ -184,4 +187,157 @@ public class ListFileUtils {
return "/dev/stdin";
return argument;
}
/**
 * Returns a new set of values, containing a final set of values expanded from values.
 * <p/>
 * Each element E of values can either be a literal string or a file ending in .list.
 * For each E that ends in .list (compared case-insensitively) and names an existing file,
 * the lines of that file are read with whitespace trimming enabled and LIST_FILE_COMMENT_START
 * passed as the comment prefix, and every line is added to the result. Any other E, including
 * a .list name for which no file exists, is added to the result as a literal.
 * <p/>
 * The result preserves first-insertion order and contains each value once.
 *
 * @param values Original values
 * @return entries from values or the files listed in values
 * @throws NullPointerException if values is null
 * @throws UserException.CouldNotReadInputFile if an existing .list file cannot be read
 */
public static Set<String> unpackSet(Collection<String> values) {
    if (values == null)
        throw new NullPointerException("values cannot be null");
    Set<String> unpackedValues = new LinkedHashSet<String>();
    // Let's first go through the list and see if we were given any files.
    // We'll add every entry in the file to our set, and treat the entries as
    // if they had been specified on the command line.
    for (String value : values) {
        File file = new File(value);
        // Lower-case with Locale.ROOT so the suffix check does not depend on the JVM's default
        // locale (e.g. under a Turkish locale "X.LIST".toLowerCase() does not end with ".list").
        boolean looksLikeListFile = value.toLowerCase(java.util.Locale.ROOT).endsWith(".list");
        if (looksLikeListFile && file.exists()) {
            try {
                unpackedValues.addAll(new XReadLines(file, true, LIST_FILE_COMMENT_START).readLines());
            } catch (IOException e) {
                throw new UserException.CouldNotReadInputFile(file, e);
            }
        } else {
            unpackedValues.add(value);
        }
    }
    return unpackedValues;
}
/**
 * Returns a new set of values including only values listed by filters.
 * <p/>
 * This overload delegates to the generic includeMatching, passing IDENTITY_STRING_CONVERTER so
 * that each value is compared as the string it already is.
 * <p/>
 * Filters may also be a file of filters: the generic overload passes filters through unpackSet.
 * NOTE(review): values are handed to the generic overload unchanged and are not passed through
 * unpackSet there, so a value naming a .list file is treated as a literal string. Callers that
 * want list files of values expanded presumably need to call unpackSet on them first -- confirm.
 *
 * @param values Values to filter
 * @param filters Filters or files with filters
 * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions
 * @return entries from values, filtered by filters
 */
public static Set<String> includeMatching(Collection<String> values, Collection<String> filters, boolean exactMatch) {
return includeMatching(values, IDENTITY_STRING_CONVERTER, filters, exactMatch);
}
/**
 * Converts a type T to a String representation.
 *
 * @param <T> Type to convert to a String.
 */
public static interface StringConverter<T> {
/**
 * Produces the String that includeMatching / excludeMatching compare against the filters.
 *
 * @param value the value to convert
 * @return the String representation of value
 */
String convert(T value);
}
/**
 * Builds a new insertion-ordered set holding only those values that match a filter.
 * <p/>
 * Filters are first expanded with unpackSet, so a filter may also be a file of filters.
 * A value matches when its converted string equals one of the expanded filters or, if
 * exactMatch is false, when any expanded filter compiled as a regular expression is found
 * somewhere in the converted string.
 * <p/>
 * The converter should convert T to a unique String for each value in the set.
 *
 * @param values Values to filter
 * @param converter Converts values to strings
 * @param filters Filters or files with filters
 * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions
 * @return entries from values including only values matching filters
 */
public static <T> Set<T> includeMatching(Collection<T> values, StringConverter<T> converter, Collection<String> filters, boolean exactMatch) {
    if (values == null)
        throw new NullPointerException("values cannot be null");
    if (converter == null)
        throw new NullPointerException("converter cannot be null");
    if (filters == null)
        throw new NullPointerException("filters cannot be null");

    final Set<String> literalFilters = unpackSet(filters);
    final Set<T> included = new LinkedHashSet<T>();
    // Patterns are only needed (and only compiled) when regular expression matching is enabled.
    final Collection<Pattern> regexFilters = exactMatch ? null : compilePatterns(literalFilters);

    for (final T candidate : values) {
        final String text = converter.convert(candidate);
        boolean keep = literalFilters.contains(text);
        if (!keep && !exactMatch) {
            for (final Pattern regex : regexFilters) {
                if (regex.matcher(text).find()) {
                    keep = true;
                    break;
                }
            }
        }
        if (keep)
            included.add(candidate);
    }
    return included;
}
/**
 * Builds a new insertion-ordered set holding every value except those that match a filter.
 * <p/>
 * Filters are first expanded with unpackSet, so a filter may also be a file of filters.
 * A value matches when its converted string equals one of the expanded filters or, if
 * exactMatch is false, when any expanded filter compiled as a regular expression is found
 * somewhere in the converted string.
 * <p/>
 * The converter should convert T to a unique String for each value in the set.
 *
 * @param values Values to filter
 * @param converter Converts values to strings
 * @param filters Filters or files with filters
 * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions
 * @return entries from values excluding any values matching filters
 */
public static <T> Set<T> excludeMatching(Collection<T> values, StringConverter<T> converter, Collection<String> filters, boolean exactMatch) {
    if (values == null)
        throw new NullPointerException("values cannot be null");
    if (converter == null)
        throw new NullPointerException("converter cannot be null");
    if (filters == null)
        throw new NullPointerException("filters cannot be null");

    final Set<String> literalFilters = unpackSet(filters);
    // Start from a copy of everything and strike out the matches.
    final Set<T> remaining = new LinkedHashSet<T>(values);
    // Patterns are only needed (and only compiled) when regular expression matching is enabled.
    final Collection<Pattern> regexFilters = exactMatch ? null : compilePatterns(literalFilters);

    for (final T candidate : values) {
        final String text = converter.convert(candidate);
        boolean drop = literalFilters.contains(text);
        if (!drop && !exactMatch) {
            for (final Pattern regex : regexFilters) {
                if (regex.matcher(text).find()) {
                    drop = true;
                    break;
                }
            }
        }
        if (drop)
            remaining.remove(candidate);
    }
    return remaining;
}
/**
 * Compiles every filter string as a regular expression.
 *
 * @param filters regular expression sources
 * @return one compiled Pattern per filter, in the iteration order of filters
 */
private static Collection<Pattern> compilePatterns(Collection<String> filters) {
    final Collection<Pattern> compiled = new ArrayList<Pattern>(filters.size());
    for (final String expression : filters)
        compiled.add(Pattern.compile(expression));
    return compiled;
}
/**
 * Converter for values that are already Strings: convert returns its argument unchanged.
 * Used by the String-only includeMatching overload.
 */
protected static final StringConverter<String> IDENTITY_STRING_CONVERTER = new StringConverter<String>() {
@Override
public String convert(String value) {
return value;
}
};
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The Broad Institute
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -12,15 +12,14 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.text;
@ -48,75 +47,92 @@ import java.util.List;
* For the love of god, please use this system for reading lines in a file.
*/
public class XReadLines implements Iterator<String>, Iterable<String> {
private BufferedReader in; // The stream we're reading from
private String nextline = null; // Return value of next call to next()
private boolean trimWhitespace = true;
private final BufferedReader in; // The stream we're reading from
private String nextLine = null; // Return value of next call to next()
private final boolean trimWhitespace;
private final String commentPrefix;
public XReadLines(final File filename) throws FileNotFoundException {
this(new FileReader(filename), true, null);
}
public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException {
this(new FileReader(filename), trimWhitespace, null);
}
/**
* Creates a new xReadLines object to read lines from filename
*
* @param filename
* @throws FileNotFoundException
* @param filename file name
* @param trimWhitespace trim whitespace
* @param commentPrefix prefix for comments or null if no prefix is set
* @throws FileNotFoundException when the file is not found
*/
public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException {
this(new FileReader(filename), trimWhitespace);
public XReadLines(final File filename, final boolean trimWhitespace, final String commentPrefix) throws FileNotFoundException {
this(new FileReader(filename), trimWhitespace, commentPrefix);
}
public XReadLines(final File filename) throws FileNotFoundException {
this(filename, true);
public XReadLines(final InputStream inputStream) throws FileNotFoundException {
this(new InputStreamReader(inputStream), true, null);
}
/**
* Creates a new xReadLines object to read lines from fileReader
*
* @param fileReader
* @throws FileNotFoundException
*/
public XReadLines(final FileReader fileReader, final boolean trimWhitespace) throws FileNotFoundException {
this(new BufferedReader(fileReader), trimWhitespace);
}
public XReadLines(final FileReader fileReader) throws FileNotFoundException {
this(fileReader, true);
public XReadLines(final InputStream inputStream, final boolean trimWhitespace) {
this(new InputStreamReader(inputStream), trimWhitespace, null);
}
/**
* Creates a new xReadLines object to read lines from an input stream
*
* @param inputStream
* @param inputStream input stream
* @param trimWhitespace trim whitespace
* @param commentPrefix prefix for comments or null if no prefix is set
*/
public XReadLines(final InputStream inputStream, final boolean trimWhitespace) {
this(new BufferedReader(new InputStreamReader(inputStream)), trimWhitespace);
}
public XReadLines(final InputStream inputStream) throws FileNotFoundException {
this(inputStream, true);
public XReadLines(final InputStream inputStream, final boolean trimWhitespace, final String commentPrefix) {
this(new InputStreamReader(inputStream), trimWhitespace, commentPrefix);
}
/**
* Creates a new xReadLines object to read lines from an bufferedReader
* Creates a new xReadLines object to read lines from a reader
*
* @param reader
* @param reader reader
*/
public XReadLines(final Reader reader) {
this(reader, true, null);
}
/**
* Creates a new xReadLines object to read lines from a reader
*
* @param reader reader
* @param trimWhitespace trim whitespace
*/
public XReadLines(final Reader reader, final boolean trimWhitespace) {
this(reader, trimWhitespace, null);
}
/**
* Creates a new xReadLines object to read lines from a reader
*
* @param reader reader
* @param trimWhitespace trim whitespace
* @param commentPrefix prefix for comments or null if no prefix is set
*/
public XReadLines(final Reader reader, final boolean trimWhitespace, final String commentPrefix) {
this.in = (reader instanceof BufferedReader) ? (BufferedReader)reader : new BufferedReader(reader);
this.trimWhitespace = trimWhitespace;
this.commentPrefix = commentPrefix;
try {
this.in = new BufferedReader(reader);
nextline = readNextLine();
this.trimWhitespace = trimWhitespace;
this.nextLine = readNextLine();
} catch(IOException e) {
throw new IllegalArgumentException(e);
}
}
public XReadLines(final Reader reader) {
this(reader, true);
}
/**
* Reads all of the lines in the file, and returns them as a list of strings
*
* @return
* @return all of the lines in the file.
*/
public List<String> readLines() {
List<String> lines = new LinkedList<String>();
@ -128,38 +144,48 @@ public class XReadLines implements Iterator<String>, Iterable<String> {
/**
* I'm an iterator too...
* @return
* @return an iterator
*/
public Iterator<String> iterator() {
return this;
}
public boolean hasNext() {
return nextline != null;
return this.nextLine != null;
}
/**
* Actually reads the next line from the stream, not accessible publically
* @return
* Actually reads the next line from the stream, not accessible publicly
* @return the next line or null
* @throws IOException if an error occurs
*/
private String readNextLine() throws IOException {
String nextline = in.readLine(); // Read another line
if (nextline != null && trimWhitespace )
nextline = nextline.trim();
return nextline;
String nextLine;
while ((nextLine = this.in.readLine()) != null) {
if (this.trimWhitespace) {
nextLine = nextLine.trim();
if (nextLine.length() == 0)
continue;
}
if (this.commentPrefix != null)
if (nextLine.startsWith(this.commentPrefix))
continue;
break;
}
return nextLine;
}
/**
* Returns the next line (minus whitespace)
* @return
* Returns the next line (optionally minus whitespace)
* @return the next line
*/
public String next() {
try {
String result = nextline;
nextline = readNextLine();
String result = this.nextLine;
this.nextLine = readNextLine();
// If we haven't reached EOF yet
if (nextline == null) {
if (this.nextLine == null) {
in.close(); // And close on EOF
}

View File

@ -42,13 +42,13 @@ public class GATKReportUnitTest extends BaseTest {
Assert.assertEquals(report.getTables().size(), 5);
GATKReportTable countVariants = report.getTable("CountVariants");
Object countVariantsPK = countVariants.getPrimaryKeyByData("dbsnp.eval.none.all");
Object countVariantsPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "all");
Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "63025520");
Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "0");
Assert.assertEquals(countVariants.get(countVariantsPK, "heterozygosity"), 4.73e-06);
GATKReportTable validationReport = report.getTable("ValidationReport");
Object validationReportPK = countVariants.getPrimaryKeyByData("dbsnp.eval.none.novel");
Object validationReportPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "novel");
Assert.assertEquals(validationReport.get(validationReportPK, "PPV"), Double.NaN);
}
@ -79,6 +79,49 @@ public class GATKReportUnitTest extends BaseTest {
Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'");
}
/**
 * Builds the fixture shared by the primary key tests: a simple report created with the
 * arguments "TableName", "sample" and "value", to which two rows are added whose first
 * entry contains a period ("foo.1" and "foo.2").
 *
 * @return the table named "TableName" from the new report
 */
private GATKReportTable makeBasicTable() {
GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value");
GATKReportTable table = report.getTable("TableName");
// The rows are added through the report after the table reference has been obtained;
// the tests using this fixture read them back through that same reference.
report.addRow("foo.1", "hello");
report.addRow("foo.2", "world");
return table;
}
/**
 * Rows whose first column contains a period must be retrievable by that exact value.
 */
@Test
public void testDottedSampleName() {
    final GATKReportTable basicTable = makeBasicTable();

    final Object firstKey = basicTable.getPrimaryKeyByData("foo.1");
    Assert.assertEquals(basicTable.get(firstKey, "value"), "hello");

    final Object secondKey = basicTable.getPrimaryKeyByData("foo.2");
    Assert.assertEquals(basicTable.get(secondKey, "value"), "world");
}
/**
 * findPrimaryKeyByData returns a key for data that is present in the fixture table and
 * null for data that is not.
 */
@Test
public void testFindPrimaryKeyByData() {
    final GATKReportTable basicTable = makeBasicTable();

    // Data present in the table, given as the first column only or as both columns.
    Assert.assertNotNull(basicTable.findPrimaryKeyByData("foo.1"));
    Assert.assertNotNull(basicTable.findPrimaryKeyByData("foo.1", "hello"));
    Assert.assertNotNull(basicTable.findPrimaryKeyByData("foo.2"));
    Assert.assertNotNull(basicTable.findPrimaryKeyByData("foo.2", "world"));

    // Data that is not in the table: more entries than the fixture has columns, or an unknown value.
    Assert.assertNull(basicTable.findPrimaryKeyByData("list", "longer", "than", "column", "count"));
    Assert.assertNull(basicTable.findPrimaryKeyByData("short"));
}
/**
 * Calling findPrimaryKeyByData with no data at all is expected to throw IllegalArgumentException.
 */
@Test(expectedExceptions = IllegalArgumentException.class)
public void testEmptyFindPrimaryKeyByData() {
    makeBasicTable().findPrimaryKeyByData();
}
/**
 * Calling findPrimaryKeyByData with a null data array is expected to throw NullPointerException.
 */
@Test(expectedExceptions = NullPointerException.class)
public void testNullFindPrimaryKeyByData() {
    final Object[] noData = null;
    makeBasicTable().findPrimaryKeyByData(noData);
}
@Test
public void testSimpleGATKReport() {
// Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.R;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Data-driven tests for RUtils.toStringList and RUtils.toNumberList.
 * Each data provider row is (input list, expected result string).
 */
public class RUtilsUnitTest {
    /**
     * @return rows of (list of strings, expected result) for testToStringList
     */
    @DataProvider(name = "stringLists")
    public Object[][] getStringLists() {
        final Object[] nullList = { null, "NA" };
        final Object[] emptyList = { Collections.emptyList(), "c()" };
        final Object[] threeStrings = { Arrays.asList("1", "2", "3"), "c('1','2','3')" };
        return new Object[][] { nullList, emptyList, threeStrings };
    }

    @Test(dataProvider = "stringLists")
    public void testToStringList(List<? extends CharSequence> actual, String expected) {
        final String converted = RUtils.toStringList(actual);
        Assert.assertEquals(converted, expected);
    }

    /**
     * @return rows of (list of numbers, expected result) for testToNumberList
     */
    @DataProvider(name = "numberLists")
    public Object[][] getNumberLists() {
        final Object[] nullList = { null, "NA" };
        final Object[] emptyList = { Collections.emptyList(), "c()" };
        final Object[] threeInts = { Arrays.asList(1, 2, 3), "c(1,2,3)" };
        final Object[] threeDoubles = { Arrays.asList(1D, 2D, 3D), "c(1.0,2.0,3.0)" };
        return new Object[][] { nullList, emptyList, threeInts, threeDoubles };
    }

    @Test(dataProvider = "numberLists")
    public void testToNumberList(List<? extends Number> actual, String expected) {
        final String converted = RUtils.toNumberList(actual);
        Assert.assertEquals(converted, expected);
    }
}

View File

@ -28,17 +28,14 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.ParsingEngine;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
/**
* Tests selected functionality in the CommandLineExecutable class
@ -74,6 +71,76 @@ public class ListFileUtilsUnitTest extends BaseTest {
performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking);
}
/**
 * unpackSet must return a literal value unchanged, and must expand a .list file to its
 * entries while dropping the comment lines in it.
 */
@Test
public void testUnpackSet() throws Exception {
    final String bamPath = "public/testdata/exampleBAM.bam";
    final Set<String> expected = new HashSet<String>(Arrays.asList(bamPath));

    // A literal value is passed through as-is.
    final Set<String> fromLiteral = ListFileUtils.unpackSet(Arrays.asList(bamPath));
    Assert.assertEquals(fromLiteral, expected);

    // A list file holding the same entry plus three comment lines yields the same set.
    final File listFile = createTempListFile("testUnpackSet",
            "#",
            bamPath,
            "#public/testdata/foo.bam",
            " # public/testdata/bar.bam");
    final Set<String> fromListFile = ListFileUtils.unpackSet(Arrays.asList(listFile.getAbsolutePath()));
    Assert.assertEquals(fromListFile, expected);
}
/**
 * Cases for testIncludeMatching. Each row is (values, filters, exactMatch, expected result).
 * Every filter list appears twice, once with exactMatch true and once with it false, so the
 * rows show the difference between exact matching and regular expression matching.
 */
@DataProvider(name="includeMatchingTests")
public Object[][] getIncludeMatchingTests() {
return new Object[][] {
// plain strings: with exactMatch false they also match as substrings
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("a") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, asSet("a", "ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, Collections.EMPTY_SET },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("ab", "abc") },
// several filters: a value is included when any one of them matches
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("a") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, asSet("a", "ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("a", "ab") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, asSet("a", "ab", "abc") },
// regular expression syntax: matches nothing when exactMatch is true
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, Collections.EMPTY_SET },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, Collections.EMPTY_SET },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, asSet("a", "ab", "abc") }
};
}
/**
 * Runs one includeMatchingTests row through ListFileUtils.includeMatching using the identity converter.
 */
@Test(dataProvider = "includeMatchingTests")
public void testIncludeMatching(Set<String> values, Collection<String> filters, boolean exactMatch, Set<String> expected) {
    Assert.assertEquals(
            ListFileUtils.includeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch),
            expected);
}
/**
 * Cases for testExcludeMatching. Each row is (values, filters, exactMatch, expected result).
 * The inputs mirror the includeMatchingTests rows; each expected set is the complement of the
 * corresponding include result.
 */
@DataProvider(name="excludeMatchingTests")
public Object[][] getExcludeMatchingTests() {
return new Object[][] {
// plain strings: with exactMatch false they also match as substrings
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, Collections.EMPTY_SET },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, asSet("a", "ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("a") },
// several filters: a value is excluded when any one of them matches
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, Collections.EMPTY_SET },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, Collections.EMPTY_SET },
// regular expression syntax: matches nothing when exactMatch is true
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, asSet("a", "ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("a") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, asSet("a", "ab", "abc") },
new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, Collections.EMPTY_SET }
};
}
/**
 * Runs one excludeMatchingTests row through ListFileUtils.excludeMatching using the identity converter.
 */
@Test(dataProvider = "excludeMatchingTests")
public void testExcludeMatching(Set<String> values, Collection<String> filters, boolean exactMatch, Set<String> expected) {
    Assert.assertEquals(
            ListFileUtils.excludeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch),
            expected);
}
/**
 * Collects the arguments into a new HashSet.
 *
 * @param args elements of the set
 * @return a new HashSet containing args
 */
private static <T> Set<T> asSet(T... args){
    final List<T> elements = Arrays.asList(args);
    return new HashSet<T>(elements);
}
private File createTempListFile( String tempFilePrefix, String... lines ) throws Exception {
File tempListFile = File.createTempFile(tempFilePrefix, ".list");
tempListFile.deleteOnExit();

View File

@ -0,0 +1,47 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.qscripts.examples
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
/**
 * Example QScript that adds a single UnifiedGenotyper with the BadMate trait mixed in,
 * run against one reference file and one bam file.
 * NOTE(review): judging by the class name, this demonstrates attaching a read filter to a
 * GATK function -- confirm that BadMate is the generated read filter extension.
 */
class ExampleReadFilter extends QScript {
@Input(doc="The reference file for the bam files.", shortName="R")
var referenceFile: File = _
@Input(doc="Bam file to genotype.", shortName="I")
var bamFile: File = _
def script() {
// Mix the BadMate trait into the genotyper instance.
val genotyper = new UnifiedGenotyper with BadMate
genotyper.reference_sequence = referenceFile
genotyper.memoryLimit = 2
// input_file is a sequence; append the single bam file to it.
genotyper.input_file :+= bamFile
add(genotyper)
}
}

View File

@ -49,7 +49,6 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) {
else
IntervalUtils.parseIntervalArguments(parser, intervals)
Collections.sort(parsedLocs)
Collections.unmodifiableList(parsedLocs)
val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY)
Collections.unmodifiableList(mergedLocs)
}

View File

@ -32,6 +32,8 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor
* Merges a vcf text file.
*/
class VcfGatherFunction extends CombineVariants with GatherFunction {
this.assumeIdenticalSamples = true
this.suppressCommandLineHeader = true
private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK]
@ -43,7 +45,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction {
this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) }
this.out = this.originalOutput
this.assumeIdenticalSamples = true
// NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor
// are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK

View File

@ -136,7 +136,7 @@ object PipelineTest extends BaseTest with Logging {
println(" value (min,target,max) table key metric")
for (validation <- evalSpec.validations) {
val table = report.getTable(validation.table)
val key = table.getPrimaryKeyByData(validation.key)
val key = table.getPrimaryKeyByData(validation.table +: validation.key.split('.') : _*)
val value = String.valueOf(table.get(key, validation.metric))
val inRange = if (value == null) false else validation.inRange(value)
val flag = if (!inRange) "*" else " "

View File

@ -0,0 +1,90 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.pipeline.examples
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.testng.annotations.Test
import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
import org.broadinstitute.sting.BaseTest
/**
 * Pipeline test that drives the ExampleReadFilter QScript through
 * PipelineTest.executeTest.
 */
class ExampleReadFilterPipelineTest {
  /**
   * Builds a PipelineTestSpec named "examplereadfilter" whose argument string
   * is the QScript path, the reference and the input BAM, then executes it.
   */
  @Test
  def testExampleReadFilter(): Unit = {
    val readFilterSpec = new PipelineTestSpec
    readFilterSpec.name = "examplereadfilter"

    // Each fragment carries its own leading space, so the pieces are joined
    // with no separator to form the final argument string.
    val qscriptArg = " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala"
    val referenceArg = " -R " + BaseTest.testDir + "exampleFASTA.fasta"
    val bamArg = " -I " + BaseTest.testDir + "exampleBAM.bam"
    readFilterSpec.args = qscriptArg + referenceArg + bamArg

    PipelineTest.executeTest(readFilterSpec)
  }
}