Merge branch 'master' into rr

This commit is contained in:
Mauricio Carneiro 2011-08-30 02:50:19 -04:00
commit 7d79de91c5
57 changed files with 1282 additions and 340 deletions

View File

@ -48,8 +48,9 @@
<property name="scaladoc.dir" value="scaladoc" />
<!-- Contracts for Java -->
<!-- To disable, run with -Duse.contracts=false -->
<property name="use.contracts" value="false" />
<!-- By default, enabled only for test targets -->
<!-- To disable for test targets, run with -Duse.contracts=false -->
<!-- To enable for non-test targets, run with -Duse.contracts=true -->
<property name="java.contracts" value="${build.dir}/java/contracts" />
<property name="contracts.version" value="1.0-20110609" />
<property name="cofoja.jar" value="${lib.dir}/cofoja-${contracts.version}.jar"/>
@ -176,6 +177,10 @@
<property name="scala.target" value="core"/>
</target>
<target name="init.usecontracts">
<property name="use.contracts" value="true" />
</target>
<target name="git.describe">
<exec executable="git" outputproperty="git.describe.output" resultproperty="git.describe.exit.value" failonerror="false">
<arg line="describe" />
@ -485,11 +490,16 @@
<pathelement location="${java.classes}" />
</path>
<!-- Run with -Dgatkdocs.include.hidden=true to include documentation for hidden features -->
<condition property="gatkdocs.include.hidden.arg" value="-include-hidden" else="">
<isset property="gatkdocs.include.hidden" />
</condition>
<javadoc doclet="org.broadinstitute.sting.utils.help.GATKDoclet"
docletpathref="doclet.classpath"
classpathref="external.dependencies"
classpath="${java.classes}"
additionalparam="-private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005"> <!-- -test to only do DocumentationTest walker -->
additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005"> <!-- -test to only do DocumentationTest walker -->
<sourcefiles>
<union>
<fileset refid="all.java.source.files"/>
@ -744,8 +754,7 @@
</scalac>
</target>
<target name="test.compile" depends="test.java.compile,test.scala.compile">
</target>
<target name="test.compile" depends="init.usecontracts,test.java.compile,test.scala.compile" />
<!-- new scala target -->

View File

@ -0,0 +1,154 @@
# gsalib provides gsa.read.gatkreport for parsing GATK report files
library(gsalib)
require("ggplot2")
require("gplots")   # textplot(), used below to render summary tables as plots

#
# Standard command line handling. Can be loaded interactively for development
# or executed with Rscript.
#
args = commandArgs(TRUE)
# any first argument at all means we were invoked from the command line
onCMDLine = ! is.na(args[1])
if ( onCMDLine ) {
  inputFileName = args[1]
  outputPDF = args[2]
} else {
  # interactive/development defaults; outputPDF = NA keeps plots on the
  # current graphics device instead of writing a PDF
  inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
  outputPDF = NA
}

# Runtimes in the report are converted by ORIGINAL_UNITS_TO_SECONDS (1/1000,
# i.e. milliseconds -> seconds); RUNTIME_UNITS is the axis-label suffix.
RUNTIME_UNITS = "(sec)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000
#
# Flattens every table of a GATK report into a single data.frame with one row
# per job, keeping only the standard job-tracking columns (so rows from tables
# with different annotation columns can be row-bound together).
#
allJobsFromReport <- function(report) {
  jobColumns <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts")
  do.call("rbind", lapply(report, function(tbl) tbl[, jobColumns]))
}
#
# Creates a Gantt-style segmentation plot of time (x) vs. job (y), drawing one
# horizontal segment per job spanning from its start time to its done time.
#
# gatkReport  : list of report tables, as read by gsa.read.gatkreport
# sortOverall : if TRUE, jobs are ordered by analysisName then startTime;
#               if FALSE, by startTime alone
#
plotJobsGantt <- function(gatkReport, sortOverall) {
  # flatten all report tables into one job-per-row data.frame
  allJobs = allJobsFromReport(gatkReport)
  if ( sortOverall ) {
    title = "All jobs, by analysis, by start time"
    allJobs = allJobs[order(allJobs$analysisName, allJobs$startTime, decreasing=T), ]
  } else {
    title = "All jobs, sorted by start time"
    allJobs = allJobs[order(allJobs$startTime, decreasing=T), ]
  }
  # y coordinate: one row per job, in the chosen sort order
  allJobs$index = 1:nrow(allJobs)
  # express all times relative to the earliest job start
  minTime = min(allJobs$startTime)
  allJobs$relStartTime = allJobs$startTime - minTime
  allJobs$relDoneTime = allJobs$doneTime - minTime
  # label drawn at the end of each segment: "jobName @ host"
  allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
  maxRelTime = max(allJobs$relDoneTime)
  p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
  p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
  p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
  # pad the x axis by 10% so labels past the last segment end are not clipped
  p <- p + xlim(0, maxRelTime * 1.1)
  p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
  p <- p + ylab("Job")
  # NOTE(review): opts() is the old ggplot2 API (later replaced by theme() /
  # ggtitle()); this assumes an old ggplot2 install — confirm version
  p <- p + opts(title=title)
  print(p)
}
#
# Plots scheduling efficiency: the number of pending / running / done jobs at
# each job start/finish event over the course of the run.
#
plotProgressByTime <- function(gatkReport) {
  allJobs = allJobsFromReport(gatkReport)
  nJobs = dim(allJobs)[1]
  allJobs = allJobs[order(allJobs$startTime, decreasing=F),]
  allJobs$index = 1:nrow(allJobs)
  # express all times relative to the earliest job start
  minTime = min(allJobs$startTime)
  allJobs$relStartTime = allJobs$startTime - minTime
  allJobs$relDoneTime = allJobs$doneTime - minTime
  # job state only changes at a start or done event, so these are the only
  # x values that need evaluating
  times = sort(c(allJobs$relStartTime, allJobs$relDoneTime))
  # counts, at every event time, the jobs satisfying predicate p(start, end, t)
  countJobs <- function(p) {
    s = allJobs$relStartTime
    e = allJobs$relDoneTime
    x = c() # I wish I knew how to make this work with apply
    for ( time in times )
      x = c(x, sum(p(s, e, time)))
    x
  }
  pending = countJobs(function(s, e, t) s > t)   # not yet started at t
  done = countJobs(function(s, e, t) e < t)      # already finished at t
  running = nJobs - pending - done               # everything else is running
  d = data.frame(times=times, pending=pending, running=running, done=done)
  # NOTE(review): melt() comes from the reshape package, which is never
  # explicitly loaded in this script — presumably pulled in transitively by
  # the old ggplot2; confirm it is available at runtime
  p <- ggplot(data=melt(d, id.vars=c("times")), aes(x=times, y=value, color=variable))
  p <- p + facet_grid(variable ~ ., scales="free")
  p <- p + geom_line(size=2)
  p <- p + xlab(paste("Time since start of first job", RUNTIME_UNITS))
  # NOTE(review): opts() is the old ggplot2 API; newer versions use ggtitle()
  p <- p + opts(title = "Job scheduling")
  print(p)
}
#
# Creates summary tables (rendered as plot pages via textplot) for each job in
# this analysis group.
#
# Columns common to every group table; any other column is treated as a
# group-specific annotation column.
standardColumns = c("jobName", "startTime", "formattedStartTime", "analysisName", "intermediate", "exechosts", "formattedDoneTime", "doneTime", "runtime")
plotGroup <- function(groupTable) {
  name = unique(groupTable$analysisName)[1]
  groupAnnotations = setdiff(names(groupTable), standardColumns)
  sub = groupTable[,c("jobName", groupAnnotations, "runtime")]
  # NOTE(review): assumes every group table carries an "iteration" annotation
  # column — confirm against the report generator
  sub = sub[order(sub$iteration, sub$jobName, decreasing=F), ]
  # create a table showing each job and all annotations
  textplot(sub, show.rownames=F)
  title(paste("Job summary for", name, "full itemization"), cex=3)
  # create the table for each combination of values in the group, listing iterations in the columns
  # NOTE(review): melt()/cast() come from the reshape package, which is not
  # explicitly loaded here — confirm it is available at runtime
  sum = cast(melt(sub, id.vars=groupAnnotations, measure.vars=c("runtime")), ... ~ iteration, fun.aggregate=mean)
  textplot(as.data.frame(sum), show.rownames=F)
  title(paste("Job summary for", name, "itemizing each iteration"), cex=3)
  # as above, but averaging over all iterations
  groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration")
  sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
  textplot(as.data.frame(sum), show.rownames=F)
  title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
}
# print out some useful basic information about this run
print("Report")
print(paste("Project :", inputFileName))
# Rescales the runtime column of every table in the report by
# ORIGINAL_UNITS_TO_SECONDS (milliseconds -> seconds); every other column
# passes through untouched.
convertUnits <- function(gatkReportData) {
  rescaleRuntime <- function(tbl) {
    tbl$runtime = tbl$runtime * ORIGINAL_UNITS_TO_SECONDS
    tbl
  }
  lapply(gatkReportData, rescaleRuntime)
}
# read the report and convert runtimes into display units
gatkReportData <- gsa.read.gatkreport(inputFileName)
gatkReportData <- convertUnits(gatkReportData)
#print(summary(gatkReportData))

# when an output PDF was requested, route all subsequent plots into it;
# otherwise they go to the current graphics device
if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}

# overview plots across all groups
plotJobsGantt(gatkReportData, T)
plotJobsGantt(gatkReportData, F)
plotProgressByTime(gatkReportData)

# per-group itemized summary tables, one group per report table
for ( group in gatkReportData ) {
plotGroup(group)
}

if ( ! is.na(outputPDF) ) {
dev.off()
}

View File

@ -31,6 +31,7 @@ import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
@ -38,6 +39,7 @@ import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;
@ -91,12 +93,12 @@ import java.util.regex.Pattern;
* -resources resources/ \
* -ignoreQ 5
* </pre>
*
*
*/
@DocumentedGATKFeature(
groupName = "AnalyzeCovariates",
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
groupName = "AnalyzeCovariates",
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
/////////////////////////////
@ -118,7 +120,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
private String PATH_TO_RESOURCES = "public/R/";
@Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
private int IGNORE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
/**
@ -323,13 +325,14 @@ public class AnalyzeCovariates extends CommandLineProgram {
}
private void callRScripts() {
RScriptExecutor.RScriptArgumentCollection argumentCollection =
new RScriptExecutor.RScriptArgumentCollection(PATH_TO_RSCRIPT, Arrays.asList(PATH_TO_RESOURCES));
RScriptExecutor executor = new RScriptExecutor(argumentCollection, true);
int numReadGroups = 0;
// for each read group
for( Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) {
Process p;
if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) {
String readGroup = readGroupKey.toString();
@ -338,35 +341,19 @@ public class AnalyzeCovariates extends CommandLineProgram {
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
try {
if (DO_INDEL_QUALITY) {
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_indelQuality.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
p.waitFor();
} else {
final String outputFilename = OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat";
if (DO_INDEL_QUALITY) {
executor.callRScripts("plot_indelQuality.R", outputFilename,
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
} else {
if( iii == 1 ) {
// Analyze reported quality
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
p.waitFor();
} else { // Analyze all other covariates
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
p.waitFor();
}
// Analyze reported quality
executor.callRScripts("plot_residualError_QualityScoreCovariate.R", outputFilename,
IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
} else { // Analyze all other covariates
executor.callRScripts("plot_residualError_OtherCovariate.R", outputFilename,
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
}
} catch (InterruptedException e) {
e.printStackTrace();
System.exit(-1);
} catch (IOException e) {
System.out.println("Fatal Exception: Perhaps RScript jobs are being spawned too quickly? One work around is to process fewer read groups using the -numRG option.");
e.printStackTrace();
System.exit(-1);
}
}
} else { // at the maximum number of read groups so break out

View File

@ -379,10 +379,14 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
if ( tribbleType == null )
throw new UserException.CommandLineException(
String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
"Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
manager.userFriendlyListOfAvailableFeatures(parameterType)));
if ( ! file.canRead() || ! file.isFile() ) {
throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
} else {
throw new UserException.CommandLineException(
String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
"Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
manager.userFriendlyListOfAvailableFeatures(parameterType)));
}
}
}

View File

@ -48,9 +48,10 @@ public class LinearMicroScheduler extends MicroScheduler {
walker.initialize();
Accumulator accumulator = Accumulator.create(engine,walker);
boolean done = walker.isDone();
int counter = 0;
for (Shard shard : shardStrategy ) {
if ( shard == null ) // we ran out of shards that aren't owned
if ( done || shard == null ) // we ran out of shards that aren't owned
break;
if(shard.getShardType() == Shard.ShardType.LOCUS) {
@ -61,6 +62,7 @@ public class LinearMicroScheduler extends MicroScheduler {
Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit());
accumulator.accumulate(dataProvider,result);
dataProvider.close();
if ( walker.isDone() ) break;
}
windowMaker.close();
}
@ -70,6 +72,8 @@ public class LinearMicroScheduler extends MicroScheduler {
accumulator.accumulate(dataProvider,result);
dataProvider.close();
}
done = walker.isDone();
}
Object result = accumulator.finishTraversal();

View File

@ -46,7 +46,6 @@ import org.simpleframework.xml.stream.Format;
import org.simpleframework.xml.stream.HyphenStyle;
import java.io.*;
import java.net.InetAddress;
import java.security.NoSuchAlgorithmException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
@ -230,22 +229,6 @@ public class GATKRunReport {
}
/**
* Helper utility that calls into the InetAddress system to resolve the hostname. If this fails,
* unresolvable gets returned instead.
*
* @return
*/
private String resolveHostname() {
try {
return InetAddress.getLocalHost().getCanonicalHostName();
}
catch (java.net.UnknownHostException uhe) { // [beware typo in code sample -dmw]
return "unresolvable";
// handle exception
}
}
public void postReport(PhoneHomeOption type) {
logger.debug("Posting report of type " + type);
switch (type) {
@ -325,7 +308,7 @@ public class GATKRunReport {
private void postReportToAWSS3() {
// modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html
this.hostName = resolveHostname(); // we want to fill in the host name
this.hostName = Utils.resolveHostname(); // we want to fill in the host name
File localFile = postReportToLocalDisk(new File("./"));
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
if ( localFile != null ) { // we succeeded in creating the local file

View File

@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.util.CloseableIterator;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureSource;
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
@ -45,10 +46,11 @@ import java.io.IOException;
* the basics of what a reference metadata track must contain.
*/
public class RMDTrack {
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
private final static boolean DEBUG = false;
// the basics of a track:
private final Class type; // our type
private final Class recordType; // the underlying records that are produced by this track
private final String name; // the name
private final File file; // the associated file we create the reader from
@ -90,7 +92,6 @@ public class RMDTrack {
*/
public RMDTrack(Class type, String name, File file, FeatureSource reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) {
this.type = type;
this.recordType = codec.getFeatureType();
this.name = name;
this.file = file;
this.reader = reader;
@ -112,19 +113,8 @@ public class RMDTrack {
}
public CloseableIterator<GATKFeature> query(GenomeLoc interval) throws IOException {
return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(interval.getContig(),interval.getStart(),interval.getStop()),this.getName());
}
public CloseableIterator<GATKFeature> query(GenomeLoc interval, boolean contained) throws IOException {
return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(interval.getContig(),interval.getStart(),interval.getStop()),this.getName());
}
public CloseableIterator<GATKFeature> query(String contig, int start, int stop) throws IOException {
return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(contig,start,stop),this.getName());
}
public CloseableIterator<GATKFeature> query(String contig, int start, int stop, boolean contained) throws IOException {
return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(contig,start,stop),this.getName());
if ( DEBUG ) logger.debug("Issuing query for %s: " + interval);
return new FeatureToGATKFeatureIterator(genomeLocParser, reader.query(interval.getContig(),interval.getStart(),interval.getStop()), this.getName());
}
public void close() {

View File

@ -92,6 +92,8 @@ import java.util.regex.Pattern;
* @author Khalid Shakir
*/
public class GATKReportTable {
/** REGEX that matches any table with an invalid name */
public final static String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2;
private String tableName;
private String tableDescription;
@ -111,7 +113,7 @@ public class GATKReportTable {
* @return true if the name is valid, false if otherwise
*/
private boolean isValidName(String name) {
Pattern p = Pattern.compile("[^a-zA-Z0-9_\\-\\.]");
Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX);
Matcher m = p.matcher(name);
return !m.find();

View File

@ -173,7 +173,9 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
* -> those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end
*/
boolean done = walker.isDone();
for (SAMRecord read : iter) {
if ( done ) break;
// get the genome loc from the read
GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read);
@ -194,6 +196,7 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
}
printProgress(dataProvider.getShard(),site);
done = walker.isDone();
}
return sum;

View File

@ -33,6 +33,7 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider));
LocusView locusView = getLocusView( walker, dataProvider );
boolean done = false;
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
@ -46,7 +47,7 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
// We keep processing while the next reference location is within the interval
while( locusView.hasNext() ) {
while( locusView.hasNext() && ! done ) {
AlignmentContext locus = locusView.next();
GenomeLoc location = locus.getLocation();
@ -76,15 +77,17 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
if (keepMeP) {
M x = walker.map(tracker, refContext, locus);
sum = walker.reduce(x, sum);
done = walker.isDone();
}
printProgress(dataProvider.getShard(),locus.getLocation());
}
}
// We have a final map call to execute here to clean up the skipped based from the
// last position in the ROD to that in the interval
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) {
// We have a final map call to execute here to clean up the skipped based from the
// last position in the ROD to that in the interval
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
// only do this if the walker isn't done!
RodLocusView rodLocusView = (RodLocusView)locusView;
long nSkipped = rodLocusView.getLastSkippedBases();
if ( nSkipped > 0 ) {

View File

@ -50,7 +50,9 @@ public class TraverseReadPairs<M,T> extends TraversalEngine<M,T, ReadPairWalker<
ReadView reads = new ReadView(dataProvider);
List<SAMRecord> pairs = new ArrayList<SAMRecord>();
boolean done = walker.isDone();
for(SAMRecord read: reads) {
if ( done ) break;
dataProvider.getShard().getReadMetrics().incrementNumReadsSeen();
if(pairs.size() == 0 || pairs.get(0).getReadName().equals(read.getReadName())) {
@ -65,6 +67,8 @@ public class TraverseReadPairs<M,T> extends TraversalEngine<M,T, ReadPairWalker<
printProgress(dataProvider.getShard(),null);
}
done = walker.isDone();
}
// If any data was left in the queue, process it.

View File

@ -82,8 +82,10 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
// get the reference ordered data
ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
boolean done = walker.isDone();
// while we still have more reads
for (SAMRecord read : reads) {
if ( done ) break;
// ReferenceContext -- the reference bases covered by the read
ReferenceContext refContext = null;
@ -106,6 +108,7 @@ public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,Read
GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
printProgress(dataProvider.getShard(),locus);
done = walker.isDone();
}
return sum;
}

View File

@ -126,6 +126,17 @@ public abstract class Walker<MapType, ReduceType> {
public void initialize() { }
/**
* A function for overloading in subclasses providing a mechanism to abort early from a walker.
*
* If this ever returns true, then the Traversal engine will stop executing map calls
* and start the process of shutting down the walker in an orderly fashion.
* @return
*/
public boolean isDone() {
return false;
}
/**
* Provide an initial value for reduce computations.
* @return Initial value of reduce.

View File

@ -234,7 +234,7 @@ public class DiffEngine {
// now that we have a specific list of values we want to show, display them
GATKReport report = new GATKReport();
final String tableName = "diffences";
final String tableName = "differences";
report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false);
GATKReportTable table = report.getTable(tableName);
table.addPrimaryKey("Difference", true);

View File

@ -156,7 +156,7 @@ public class UnifiedArgumentCollection {
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@Hidden
@Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
@Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false)
public boolean dovit = false;
@Hidden

View File

@ -274,7 +274,7 @@ public class PairHMMIndelErrorModel {
this.doViterbi = dovit;
}
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
@ -754,7 +754,7 @@ public class PairHMMIndelErrorModel {
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
if (indelLikelihoodMap.containsKey(p)) {
HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readLikelihoods[readIdx][j++] = el.get(a);
@ -1055,7 +1055,6 @@ public class PairHMMIndelErrorModel {
genotypeLikelihoods[i] -= maxElement;
return genotypeLikelihoods;
}
/**

View File

@ -303,7 +303,10 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
uac.alleles = alleles;
if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions;
if (deletions >= 0)
uac.MAX_DELETION_FRACTION = deletions;
else
uac.MAX_DELETION_FRACTION = 1.0;
if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf;
if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
@ -347,12 +350,14 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
}
VariantCallContext call;
if ( vcComp.isSNP() )
if ( vcComp.isSNP() ) {
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
else if ( vcComp.isIndel() ) {
} else if ( vcComp.isIndel() ) {
call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
}
else {
} else if ( bamIsTruth ) {
// assume it's a SNP if no variation is present; this is necessary so that we can test supposed monomorphic sites against the truth bam
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
} else {
logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
return counter;
}

View File

@ -62,5 +62,5 @@ public class VariantRecalibratorArgumentCollection {
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
public double PERCENT_BAD_VARIANTS = 0.03;
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.", required=false)
public int MIN_NUM_BAD_VARIANTS = 2000;
public int MIN_NUM_BAD_VARIANTS = 2500;
}

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.apache.poi.hpsf.Variant;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.utils.MathUtils;
@ -165,6 +166,23 @@ import java.util.*;
* -o output.vcf \
* -fraction 0.5
*
* Select only indels from a VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -selectType INDEL
*
* Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -selectType SNP -selectType MNP \
* -restrictAllelesTo MULTIALLELIC
*
* </pre>
*
*/
@ -223,6 +241,18 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
private boolean EXCLUDE_FILTERED = false;
/**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
* For example, a multiallelic record such as:
* 1 100 . A AAA,AAAAA
* will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
* 1 100 . A T
* will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC
*/
@Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@ -266,11 +296,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
@Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
private boolean SELECT_SNPS = false;
/**
* This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
* When specified one or more times, a particular type of variant is selected.
*
*/
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();
@Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@ -290,6 +323,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
}
public enum NumberAlleleRestriction {
ALL,
BIALLELIC,
MULTIALLELIC
}
private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
private ArrayList<String> selectNames = new ArrayList<String>();
private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
@ -354,6 +394,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
if (TYPES_TO_INCLUDE.isEmpty()) {
for (VariantContext.Type t : VariantContext.Type.values())
selectedTypes.add(t);
}
else {
for (VariantContext.Type t : TYPES_TO_INCLUDE)
selectedTypes.add(t);
}
// Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@ -494,12 +548,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
return 0;
}
// TODO - add ability to also select MNPs
// TODO - move variant selection arguments to the engine so other walkers can also do this
if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed()))
if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
continue;
if (SELECT_SNPS && !vc.isSNP())
if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
continue;
if (!selectedTypes.contains(vc.getType()))
continue;
VariantContext sub = subsetRecord(vc, samples);

View File

@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* multi-allelic INFO field values can be lists of values.
*/
@Advanced
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@Hidden
@Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false)
public boolean logACSum = false;
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
@ -147,22 +151,21 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) {
for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
out.println(Utils.join("\t", vals));
}
for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum);
out.println(Utils.join("\t", vals));
}
return 1;
} else {
if ( nRecords >= MAX_RECORDS ) {
logger.warn("Calling sys exit to leave after " + nRecords + " records");
System.exit(0); // todo -- what's the recommend way to abort like this?
}
return 0;
}
return 1;
}
@Override
public boolean isDone() {
boolean done = MAX_RECORDS != -1 && nRecords >= MAX_RECORDS;
if ( done) logger.warn("isDone() will return true to leave after " + nRecords + " records");
return done ;
}
private static final boolean isWildCard(String s) {
@ -176,9 +179,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* @param fields a non-null list of fields to capture from VC
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
* provides a value of NA
* @param kma if true, multiallelic variants are to be kept
* @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count.
* @return
*/
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
private static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean kma, boolean logsum) {
List<String> vals = new ArrayList<String>();
for ( String field : fields ) {
@ -206,12 +211,31 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
if (field.equals("AF") || field.equals("AC")) {
if (val.contains(",")) {
// strip [,] and spaces
val = val.replace("[","");
val = val.replace("]","");
val = val.replace(" ","");
}
String afo = val;
double af=0;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double[] afd = new double[afs.length];
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
if (kma && logsum)
af = MathUtils.sum(afd);
else
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
}
else
if (!afo.equals("NA"))
af = Double.valueOf(afo);
val = Double.toString(af);
}
vals.add(val);
@ -220,6 +244,9 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return vals;
}
/**
 * Backward-compatible overload: extracts the requested fields from vc with
 * multiallelic handling disabled (kma=false) and AC/AF summing disabled (logsum=false).
 *
 * @param vc               the variant context to extract field values from
 * @param fields           a non-null list of field names to capture
 * @param allowMissingData if false, a field missing from vc raises a UserException;
 *                         otherwise the value "NA" is provided
 * @return the extracted field values, one per requested field
 */
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
return extractFields(vc, fields, allowMissingData, false, false);
}
//
// default reduce -- doesn't do anything at all
//
@ -243,7 +270,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
if ( vc.hasReferenceBaseForIndel() ) {
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x=x+new String(new byte[]{refByte});
}
@ -255,7 +282,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
if ( vc.hasReferenceBaseForIndel() ) {
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x.append(new String(new byte[]{refByte}));
}

View File

@ -0,0 +1,128 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.R;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.utils.PathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
/**
 * Generic service for locating and executing R scripts from the GATK resource directories.
 *
 * Scripts are resolved against the configured resource paths
 * ({@link RScriptArgumentCollection#PATH_TO_RESOURCES}) and run via the configured
 * Rscript binary. Failures either raise a {@link UserException} or are logged as
 * warnings, depending on the exceptOnError flag passed at construction.
 */
public class RScriptExecutor {
    /**
     * our log
     */
    protected static Logger logger = Logger.getLogger(RScriptExecutor.class);

    /** Command-line arguments controlling where the Rscript binary and the R resource scripts live. */
    public static class RScriptArgumentCollection {
        @Advanced
        @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
        public String PATH_TO_RSCRIPT = "Rscript";

        @Advanced
        @Argument(fullName = "path_to_Rresources", shortName = "Rresources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
        public List<String> PATH_TO_RESOURCES = Arrays.asList("public/R/", "private/R/");

        public RScriptArgumentCollection() {}

        /** For testing and convenience */
        public RScriptArgumentCollection(final String PATH_TO_RSCRIPT, final List<String> PATH_TO_RESOURCES) {
            this.PATH_TO_RSCRIPT = PATH_TO_RSCRIPT;
            this.PATH_TO_RESOURCES = PATH_TO_RESOURCES;
        }
    }

    final RScriptArgumentCollection myArgs;
    final boolean exceptOnError;

    /**
     * @param myArgs        configuration for the Rscript binary and resource paths
     * @param exceptOnError if true, failures throw a UserException; otherwise they are logged as warnings
     */
    public RScriptExecutor(final RScriptArgumentCollection myArgs, final boolean exceptOnError) {
        this.myArgs = myArgs;
        this.exceptOnError = exceptOnError;
    }

    /** Varargs convenience overload of {@link #callRScripts(String, List)}. */
    public void callRScripts(String scriptName, Object... scriptArgs) {
        callRScripts(scriptName, Arrays.asList(scriptArgs));
    }

    /**
     * Locate scriptName in the resource paths and execute it with the given arguments,
     * blocking until the script finishes.
     *
     * Fix: the exit status of the Rscript process was previously discarded, so a script
     * that failed went completely unnoticed. A non-zero exit now goes through
     * generateException(), honoring the exceptOnError policy like every other failure.
     *
     * NOTE(review): the command line is joined and passed to Runtime.exec(String), which
     * tokenizes on whitespace — arguments containing spaces will be split. Confirm callers
     * never pass such arguments, or migrate to ProcessBuilder with an argument list.
     *
     * @param scriptName name of the R script to run, resolved via findScript()
     * @param scriptArgs arguments appended (space-joined) to the command line
     */
    public void callRScripts(String scriptName, List<Object> scriptArgs) {
        try {
            final File pathToScript = findScript(scriptName);
            if ( pathToScript == null ) return; // we failed but shouldn't exception out

            final String argString = Utils.join(" ", scriptArgs);
            final String cmdLine = Utils.join(" ", Arrays.asList(myArgs.PATH_TO_RSCRIPT, pathToScript, argString));
            logger.info("Executing RScript: " + cmdLine);
            final int exitValue = Runtime.getRuntime().exec(cmdLine).waitFor();
            if ( exitValue != 0 )
                generateException("RScript exited with non-zero status " + exitValue + ": " + cmdLine);
        } catch (InterruptedException e) {
            generateException(e);
        } catch (IOException e) {
            generateException("Fatal Exception: Perhaps RScript jobs are being spawned too quickly?", e);
        }
    }

    /**
     * Search the configured resource directories for scriptName.
     *
     * @param scriptName the script file name to look for
     * @return the first existing, readable match; null if nothing was found
     *         (after routing the failure through generateException())
     */
    public File findScript(final String scriptName) {
        for ( String pathToResource : myArgs.PATH_TO_RESOURCES ) {
            final File f = new File(pathToResource + "/" + scriptName);
            if ( f.exists() ) {
                if ( f.canRead() )
                    return f;
                else
                    generateException("Script exists but couldn't be read: " + scriptName);
            }
        }

        generateException("Couldn't find script: " + scriptName + " in " + myArgs.PATH_TO_RESOURCES);
        return null;
    }

    private void generateException(String msg) {
        generateException(msg, null);
    }

    private void generateException(Throwable e) {
        generateException("", e);
    }

    /** Central failure policy: throw a UserException or log a warning, per exceptOnError. */
    private void generateException(String msg, Throwable e) {
        if ( exceptOnError )
            throw new UserException(msg, e);
        else
            logger.warn(msg + (e == null ? "" : ":" + e.getMessage()));
    }
}

View File

@ -29,6 +29,7 @@ import net.sf.samtools.util.StringUtil;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.collections.Pair;
import java.net.InetAddress;
import java.util.*;
/**
@ -633,4 +634,20 @@ public class Utils {
public static boolean isFlagSet(int value, int flag) {
return ((value & flag) == flag);
}
/**
 * Resolve the canonical hostname of the local machine via the InetAddress system.
 *
 * @return the canonical local hostname, or the literal string "unresolvable"
 *         if name resolution fails
 */
public static final String resolveHostname() {
    String hostname;
    try {
        hostname = InetAddress.getLocalHost().getCanonicalHostName();
    } catch (java.net.UnknownHostException uhe) {
        // name lookup failed; fall back to a fixed placeholder
        hostname = "unresolvable";
    }
    return hostname;
}
}

View File

@ -281,7 +281,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
VariantContext vc = null;
try {
vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes);
vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]);
} catch (Exception e) {
generateException(e.getMessage());
}
@ -290,8 +290,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
if ( !header.samplesWereAlreadySorted() )
vc.getGenotypes();
// Trim bases of all alleles if necessary
return createVariantContextWithTrimmedAlleles(vc);
return vc;
}
/**
@ -516,25 +515,44 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
return true;
}
private static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
boolean clipping = true;
// Note that the computation of forward clipping here is meant only to see whether there is a common
// base to all alleles, and to correctly compute reverse clipping,
// but it is not used for actually changing alleles - this is done in function
// createVariantContextWithTrimmedAlleles() below.
for (Allele a : unclippedAlleles) {
if (a.isSymbolic()) {
for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() )
continue;
}
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) {
clipping = false;
break;
}
}
return (clipping) ? 1 : 0;
return (clipping) ? 1 : 0;
}
/**
 * Compute how many trailing bases are common to the reference and every non-symbolic
 * allele, i.e. how many bases can be clipped from the right-hand end of the alleles.
 *
 * Clipping stops as soon as any allele either (a) would be reduced past the region left
 * after forward clipping, or (b) disagrees with the reference at the position being
 * examined. Exhausting the reference string entirely is reported as a malformed record.
 *
 * @param unclippedAlleles the alleles before any clipping; symbolic alleles are ignored
 * @param ref              the reference allele string
 * @param forwardClipping  number of bases already clipped from the front of the alleles
 * @param lineNo           current line number in the file, used for error reporting
 * @return the number of bases to clip from the end of each allele
 */
protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
int clipping = 0;
boolean stillClipping = true;

while ( stillClipping ) {
for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() )
continue;

// stop if clipping one more base would consume the allele beyond the forward-clipped region
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
stillClipping = false;
else if ( ref.length() == clipping )
generateException("bad alleles encountered", lineNo);
// stop at the first position where this allele's last unclipped base differs from the reference
else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] )
stillClipping = false;
}
// every allele still agreed with the reference at this depth; clip one more base
if ( stillClipping )
clipping++;
}

return clipping;
}
/**
* clip the alleles, based on the reference
*
@ -542,122 +560,30 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
* @param ref the reference string
* @param unclippedAlleles the list of unclipped alleles
* @param clippedAlleles output list of clipped alleles
* @param lineNo the current line number in the file
* @return the new reference end position of this event
*/
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
// Note that the computation of forward clipping here is meant only to see whether there is a common
// base to all alleles, and to correctly compute reverse clipping,
// but it is not used for actually changing alleles - this is done in function
// createVariantContextWithTrimmedAlleles() below.
int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
int reverseClipped = 0;
boolean clipping = true;
while (clipping) {
for (Allele a : unclippedAlleles) {
if (a.isSymbolic()) {
continue;
}
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
clipping = false;
else if (ref.length() == reverseClipped)
generateException("bad alleles encountered", lineNo);
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
clipping = false;
}
if (clipping) reverseClipped++;
}
int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
if ( clippedAlleles != null ) {
for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() ) {
clippedAlleles.add(a);
} else {
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference()));
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference()));
}
}
}
// the new reference length
int refLength = ref.length() - reverseClipped;
int refLength = ref.length() - reverseClipping;
return position+Math.max(refLength - 1,0);
}
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
boolean trimVC;
// We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles
Allele refAllele = inputVC.getReference();
if (!inputVC.isVariant())
trimVC = false;
else if (refAllele.isNull())
trimVC = false;
else {
trimVC = (computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
inputVC.getReference().getDisplayString()) > 0);
}
// nothing to do if we don't need to trim bases
if (trimVC) {
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
// set the reference base for indels in the attributes
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
for (Allele a : inputVC.getAlleles()) {
if (a.isSymbolic()) {
alleles.add(a);
originalToTrimmedAlleleMap.put(a, a);
} else {
// get bases for current allele and create a new one with trimmed bases
byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length());
Allele trimmedAllele = Allele.create(newBases, a.isReference());
alleles.add(trimmedAllele);
originalToTrimmedAlleleMap.put(a, trimmedAllele);
}
}
// detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation
// example: mixed records such as {TA*,TGA,TG}
boolean hasNullAlleles = false;
for (Allele a: originalToTrimmedAlleleMap.values()) {
if (a.isNull())
hasNullAlleles = true;
if (a.isReference())
refAllele = a;
}
if (!hasNullAlleles)
return inputVC;
// now we can recreate new genotypes with trimmed alleles
for ( Map.Entry<String, Genotype> sample : inputVC.getGenotypes().entrySet() ) {
List<Allele> originalAlleles = sample.getValue().getAlleles();
List<Allele> trimmedAlleles = new ArrayList<Allele>();
for ( Allele a : originalAlleles ) {
if ( a.isCalled() )
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
else
trimmedAlleles.add(Allele.NO_CALL);
}
genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles));
}
return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0]));
}
return inputVC;
}
public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
try {
return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||

View File

@ -116,10 +116,26 @@ public class VCFUtils {
return fields;
}
/** Emits each distinct header-conflict warning at most once, and only when a logger was supplied. */
private static final class HeaderConflictWarner {
    Logger logger;
    Set<String> alreadyIssued = new HashSet<String>();

    private HeaderConflictWarner(final Logger logger) {
        this.logger = logger;
    }

    /** Log msg once per header-line key; later warnings for the same key are suppressed. */
    public void warn(final VCFHeaderLine line, final String msg) {
        if ( logger == null )
            return;
        final String key = line.getKey();
        if ( alreadyIssued.contains(key) )
            return;
        alreadyIssued.add(key);
        logger.warn(msg);
    }
}
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers, Logger logger) throws IllegalStateException {
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
HeaderConflictWarner conflictWarner = new HeaderConflictWarner(logger);
// todo -- needs to remove all version headers from sources and add its own VCF version line
for ( VCFHeader source : headers ) {
@ -152,24 +168,24 @@ public class VCFUtils {
// number, then this value should be 1. However, if the INFO field describes a pair
// of numbers, then this value should be 2 and so on. If the number of possible
// values varies, is unknown, or is unbounded, then this value should be '.'.
if ( logger != null ) logger.warn("Promoting header field Number to . due to number differences in header lines: " + line + " " + other);
conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other);
compOther.setNumberToUnbounded();
} else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) {
// promote key to Float
if ( logger != null ) logger.warn("Promoting Integer to Float in header: " + compOther);
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
map.put(key, compOther);
} else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) {
// promote key to Float
if ( logger != null ) logger.warn("Promoting Integer to Float in header: " + compOther);
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
} else {
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
}
}
if ( ! compLine.getDescription().equals(compOther) )
if ( logger != null ) logger.warn("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine);
conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine);
} else {
// we are not equal, but we're not anything special either
if ( logger != null ) logger.warn("Ignoring header line already in map: this header line = " + line + " already present header = " + other);
conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other);
}
} else {
map.put(key, line);

View File

@ -209,6 +209,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
/**
* the complete constructor. Makes a complete VariantContext from its arguments
* This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES.
*
* @param source source
* @param contig the contig
@ -257,9 +258,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @param negLog10PError qual
* @param filters filters: use null for unfiltered and empty set for passes filters
* @param attributes attributes
* @param referenceBaseForIndel padded reference base
*/
public VariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, null, true);
public VariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, Byte referenceBaseForIndel) {
this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true);
}
/**
@ -293,7 +295,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
}
/**
* Create a new variant context without genotypes and no Perror, no filters, and no attributes
* Create a new variant context with genotypes but without Perror, filters, and attributes
*
* @param source source
* @param contig the contig

View File

@ -657,12 +657,84 @@ public class VariantContextUtils {
VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) );
// Trim the padded bases of all alleles if necessary
merged = AbstractVCFCodec.createVariantContextWithTrimmedAlleles(merged);
merged = createVariantContextWithTrimmedAlleles(merged);
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
return merged;
}
/**
 * If a common leading reference base can be trimmed from all alternate alleles, return a
 * new VariantContext with that base removed from every allele (and from every genotype's
 * alleles); otherwise return inputVC unchanged.
 *
 * Trimming only proceeds when (a) the context is variant, (b) the reference allele is
 * non-null, (c) forward clipping (per AbstractVCFCodec.computeForwardClipping) finds a
 * shared leading base, and (d) trimming actually produces a null allele — mixed records
 * such as {TA*,TGA,TG} are kept in their original representation.
 *
 * @param inputVC the context to (possibly) trim
 * @return a trimmed copy, or inputVC itself when no trimming applies
 */
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
boolean trimVC;

// We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles
Allele refAllele = inputVC.getReference();
if (!inputVC.isVariant())
trimVC = false;
else if (refAllele.isNull())
trimVC = false;
else {
trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
inputVC.getReference().getDisplayString()) > 0);
}

// nothing to do if we don't need to trim bases
if (trimVC) {
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();

// copy attributes; the padded reference base itself is passed to the new VariantContext below
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());

Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();

// build the trimmed allele list, dropping the first base of each non-symbolic allele
for (Allele a : inputVC.getAlleles()) {
if (a.isSymbolic()) {
alleles.add(a);
originalToTrimmedAlleleMap.put(a, a);
} else {
// get bases for current allele and create a new one with trimmed bases
byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length());
Allele trimmedAllele = Allele.create(newBases, a.isReference());
alleles.add(trimmedAllele);
originalToTrimmedAlleleMap.put(a, trimmedAllele);
}
}

// detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation
// example: mixed records such as {TA*,TGA,TG}
boolean hasNullAlleles = false;

for (Allele a: originalToTrimmedAlleleMap.values()) {
if (a.isNull())
hasNullAlleles = true;
if (a.isReference())
refAllele = a;
}

if (!hasNullAlleles)
return inputVC;

// now we can recreate new genotypes with trimmed alleles
for ( Map.Entry<String, Genotype> sample : inputVC.getGenotypes().entrySet() ) {
List<Allele> originalAlleles = sample.getValue().getAlleles();
List<Allele> trimmedAlleles = new ArrayList<Allele>();
for ( Allele a : originalAlleles ) {
if ( a.isCalled() )
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
else
trimmedAlleles.add(Allele.NO_CALL);
}
genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles));
}

// rebuild the context with trimmed alleles/genotypes, preserving filters (when applied),
// attributes, and the original leading reference base as the indel padding base
return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0]));
}

return inputVC;
}
public static Map<String, Genotype> stripPLs(Map<String, Genotype> genotypes) {
Map<String, Genotype> newGs = new HashMap<String, Genotype>(genotypes.size());

View File

@ -40,6 +40,8 @@ import java.util.EnumSet;
* Test suite for the parsing engine.
*/
public class ParsingEngineUnitTest extends BaseTest {
/** we absolutely cannot have this file existing, or we'll fail the UnitTest */
private final static String NON_EXISTANT_FILENAME_VCF = "this_file_should_not_exist_on_disk_123456789.vcf";
private ParsingEngine parsingEngine;
@BeforeMethod
@ -636,7 +638,7 @@ public class ParsingEngineUnitTest extends BaseTest {
@Test
public void basicRodBindingArgumentTest() {
final String[] commandLine = new String[] {"-V:vcf","foo.vcf"};
final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -646,7 +648,7 @@ public class ParsingEngineUnitTest extends BaseTest {
parsingEngine.loadArgumentsIntoObject( argProvider );
Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly");
Assert.assertEquals(argProvider.binding.getSource(), "foo.vcf", "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value");
Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value");
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
@ -659,7 +661,7 @@ public class ParsingEngineUnitTest extends BaseTest {
@Test
public void shortNameOnlyRodBindingArgumentTest() {
final String[] commandLine = new String[] {"-short:vcf","foo.vcf"};
final String[] commandLine = new String[] {"-short:vcf",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( ShortNameOnlyRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -669,7 +671,7 @@ public class ParsingEngineUnitTest extends BaseTest {
parsingEngine.loadArgumentsIntoObject( argProvider );
Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly");
Assert.assertEquals(argProvider.binding.getSource(), "foo.vcf", "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value");
Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value");
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
@ -711,7 +713,7 @@ public class ParsingEngineUnitTest extends BaseTest {
@Test(expectedExceptions = UserException.class)
public void rodBindingArgumentTestMissingType() {
final String[] commandLine = new String[] {"-V","foo.vcf"};
final String[] commandLine = new String[] {"-V",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -723,7 +725,7 @@ public class ParsingEngineUnitTest extends BaseTest {
@Test(expectedExceptions = UserException.class)
public void rodBindingArgumentTestTooManyTags() {
final String[] commandLine = new String[] {"-V:x,y,z","foo.vcf"};
final String[] commandLine = new String[] {"-V:x,y,z",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -740,7 +742,7 @@ public class ParsingEngineUnitTest extends BaseTest {
@Test
public void variantContextBindingArgumentTest() {
final String[] commandLine = new String[] {"-V:vcf","foo.vcf"};
final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -750,14 +752,14 @@ public class ParsingEngineUnitTest extends BaseTest {
parsingEngine.loadArgumentsIntoObject( argProvider );
Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly");
Assert.assertEquals(argProvider.binding.getSource(), "foo.vcf", "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
}
@Test
public void variantContextBindingArgumentTestVCF3() {
final String[] commandLine = new String[] {"-V:vcf3","foo.vcf"};
final String[] commandLine = new String[] {"-V:vcf3",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -767,7 +769,7 @@ public class ParsingEngineUnitTest extends BaseTest {
parsingEngine.loadArgumentsIntoObject( argProvider );
Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly");
Assert.assertEquals(argProvider.binding.getSource(), "foo.vcf", "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
}
@ -779,7 +781,7 @@ public class ParsingEngineUnitTest extends BaseTest {
@Test
public void listRodBindingArgumentTest() {
final String[] commandLine = new String[] {"-V:vcf","foo.vcf"};
final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( ListRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -791,14 +793,14 @@ public class ParsingEngineUnitTest extends BaseTest {
Assert.assertEquals(argProvider.bindings.size(), 1, "Unexpected number of bindings");
RodBinding<Feature> binding = argProvider.bindings.get(0);
Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly");
Assert.assertEquals(binding.getSource(), "foo.vcf", "Source isn't set to its expected value");
Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value");
Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value");
Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
}
@Test
public void listRodBindingArgumentTest2Args() {
final String[] commandLine = new String[] {"-V:vcf","foo.vcf", "-V:vcf", "bar.vcf"};
final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF, "-V:vcf", "bar.vcf"};
parsingEngine.addArgumentSource( ListRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
@ -811,7 +813,7 @@ public class ParsingEngineUnitTest extends BaseTest {
RodBinding<Feature> binding = argProvider.bindings.get(0);
Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly");
Assert.assertEquals(binding.getSource(), "foo.vcf", "Source isn't set to its expected value");
Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value");
Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value");
Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
@ -838,7 +840,7 @@ public class ParsingEngineUnitTest extends BaseTest {
@Test
public void listRodBindingArgumentTestExplicitlyNamed() {
final String[] commandLine = new String[] {"-V:foo,vcf","foo.vcf", "-V:foo,vcf", "bar.vcf"};
final String[] commandLine = new String[] {"-V:foo,vcf",NON_EXISTANT_FILENAME_VCF, "-V:foo,vcf", "bar.vcf"};
parsingEngine.addArgumentSource( ListRodBindingArgProvider.class );
parsingEngine.parse( commandLine );

View File

@ -50,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
@DataProvider(name = "data")
public Object[][] createData() {
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "92311de76dda3f38aac289d807ef23d0");
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "0c69412c385fda50210f2a612e1ffe4a");
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dc1ca75c6ecf32641967d61e167acfff");
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "df0fcb568a3a49fc74830103b2e26f6c");
return TestParams.getTests(TestParams.class);
}

View File

@ -77,6 +77,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testConcordance--" + testFile, spec);
}
@Test
public void testVariantTypeSelection() {
String testFile = validationDataLocation + "complexExample1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER",
1,
Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf")
);
executeTest("testVariantTypeSelection--" + testFile, spec);
}
@Test(enabled=false)
public void testRemovePLs() {
String testFile = validationDataLocation + "combine.3.vcf";

View File

@ -40,6 +40,7 @@ import java.util.Arrays;
import java.util.List;
public class LibDrmaaIntegrationTest extends BaseTest {
private String implementation = null;
@Test
public void testDrmaa() throws Exception {
@ -79,10 +80,24 @@ public class LibDrmaaIntegrationTest extends BaseTest {
Assert.fail(String.format("Could not get DRMAA implementation from the DRMAA library: %s", error.getString(0)));
System.out.println(String.format("DRMAA implementation(s): %s", drmaaImplementation.getString(0)));
this.implementation = drmaaImplementation.getString(0);
}
@Test
@Test(dependsOnMethods = { "testDrmaa" })
public void testSubmitEcho() throws Exception {
if (implementation.indexOf("LSF") >= 0) {
System.err.println(" *********************************************************");
System.err.println(" ***********************************************************");
System.err.println(" **** ****");
System.err.println(" **** Skipping LibDrmaaIntegrationTest.testSubmitEcho() ****");
System.err.println(" **** Are you using the dotkit .combined_LSF_SGE? ****");
System.err.println(" **** ****");
System.err.println(" ***********************************************************");
System.err.println(" *********************************************************");
return;
}
Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER);
int errnum;
@ -129,7 +144,7 @@ public class LibDrmaaIntegrationTest extends BaseTest {
errnum = LibDrmaa.drmaa_set_vector_attribute(jt, LibDrmaa.DRMAA_V_ARGV, args, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);
if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS)
Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_REMOTE_COMMAND, error.getString(0)));
Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_V_ARGV, error.getString(0)));
errnum = LibDrmaa.drmaa_run_job(jobIdMem, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.R;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.diffengine.DiffElement;
import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine;
import org.broadinstitute.sting.gatk.walkers.diffengine.DiffNode;
import org.broadinstitute.sting.gatk.walkers.diffengine.Difference;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Basic unit tests for RScriptExecutor: script lookup across resource search paths
 * and error-handling behavior when scripts or the Rscript binary are missing.
 */
public class RScriptExecutorUnitTest extends BaseTest {
    final static String testrscript = "print(\"hello, world\")\n";
    final static String publicRScript = "plot_Tranches.R";
    final static String privateRScript = "variantCallQC.R";

    // --------------------------------------------------------------------------------
    //
    // RScriptExecutor test driver
    //
    // --------------------------------------------------------------------------------

    /**
     * Builds an RScriptExecutor from an argument collection (optionally overriding the
     * Rscript binary path and/or appending an extra resource search path) and runs the
     * named script through it.
     *
     * @param script            name of the R script to locate and run
     * @param pathToRscript     override for the Rscript binary path, or null to use the default
     * @param anotherSearchPath extra resource search directory to append, or null for none
     * @param exceptOnError     if true the executor throws UserException on failure
     */
    private void testOne(String script, String pathToRscript, String anotherSearchPath, boolean exceptOnError) {
        final RScriptExecutor.RScriptArgumentCollection args =
                new RScriptExecutor.RScriptArgumentCollection();

        if (pathToRscript != null)
            args.PATH_TO_RSCRIPT = pathToRscript;

        if (anotherSearchPath != null) {
            // Append the extra directory to a copy of the default resource search paths
            final List<String> searchPaths = new ArrayList<String>(args.PATH_TO_RESOURCES);
            searchPaths.add(anotherSearchPath);
            args.PATH_TO_RESOURCES = searchPaths;
        }

        new RScriptExecutor(args, exceptOnError).callRScripts(script);
    }

    @Test
    public void testPublic() { testOne(publicRScript, null, null, true); }

    @Test
    public void testPrivate() { testOne(privateRScript, null, null, true); }

    // make sure we don't break finding something in private by adding another directory
    @Test
    public void testPrivateWithAdditionalPath1() { testOne(privateRScript, null, "dist", true); }

    // make sure we don't break finding something in private by adding another directory
    @Test
    public void testPrivateWithAdditionalPath2() { testOne(privateRScript, null, "doesNotExist", true); }

    @Test(expectedExceptions = UserException.class)
    public void testNonExistantScriptException() { testOne("does_not_exist.R", null, null, true); }

    @Test
    public void testNonExistantScriptNoException() { testOne("does_not_exist.R", null, null, false); }

    @Test(expectedExceptions = UserException.class)
    public void testNonExistantRScriptException() { testOne(publicRScript, "badRScriptValue", null, true); }

    @Test
    public void testNonExistantRScriptNoException() { testOne(publicRScript, "badRScriptValue", null, false); }

    // A freshly written script in a brand-new directory should be found once
    // that directory is added to the search path.
    @Test
    public void testScriptInNewPath() throws IOException {
        final File scriptFile = createTempFile("myTestScript", ".R");
        FileUtils.writeStringToFile(scriptFile, testrscript);
        testOne(scriptFile.getName(), null, scriptFile.getParent(), true);
    }
}

View File

@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript {
// Because the realignment only happens after these scripts are executed, in case you are using
// bwa realignment, this function will operate over the original bam files and output over the
// (to be realigned) bam files.
def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = {
def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = {
// Creating a table with SAMPLE information from each input BAM file
val sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript {
sampleTable(sample) :+= rBam
}
}
return sampleTable.toMap
println("\n\n*** INPUT FILES ***\n")
// Creating one file for each sample in the dataset
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
for ((sample, flist) <- sampleTable) {
println(sample + ":")
for (f <- flist)
println (f)
println()
val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
sampleBamFiles(sample) = sampleFileName
add(writeList(flist, sampleFileName))
}
println("*** INPUT FILES ***\n\n")
return sampleBamFiles.toMap
// println("\n\n*** INPUT FILES ***\n")
// // Creating one file for each sample in the dataset
// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
// for ((sample, flist) <- sampleTable) {
//
// println(sample + ":")
// for (f <- flist)
// println (f)
// println()
//
// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
// sampleBamFiles(sample) = sampleFileName
// //add(writeList(flist, sampleFileName))
// }
// println("*** INPUT FILES ***\n\n")
//
// return sampleBamFiles.toMap
}
// Rebuilds the Read Group string to give BWA
@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript {
def script = {
// final output list of processed bam files
var cohortList: List[File] = List()
// sets the model for the Indel Realigner
cleanModelEnum = getIndelCleaningModel()
// keep a record of the number of contigs in the first bam file in the list
@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript {
val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)}
// Generate a BAM file per sample joining all per lane files if necessary
val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs)
// generate a BAM file per sample joining all per lane files if necessary
val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs)
// Final output list of processed bam files
var cohortList: List[File] = List()
// Simple progress report
println("\nFound the following samples: ")
for ((sample, file) <- sampleBAMFiles)
println("\t" + sample + " -> " + file)
println("\n")
// If this is a 'knowns only' indel realignment run, do it only once for all samples.
// if this is a 'knowns only' indel realignment run, do it only once for all samples.
val globalIntervals = new File(outputDir + projectName + ".intervals")
if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(null, globalIntervals))
// Put each sample through the pipeline
for ((sample, sampleFile) <- sampleBAMFiles) {
val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile}
// put each sample through the pipeline
for ((sample, bamList) <- sampleBAMFiles) {
// BAM files generated by the pipeline
val bam = new File(qscript.projectName + "." + sample + ".bam")
val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript {
// Validation is an optional step for the BAM file generated after
// alignment and the final bam file of the pipeline.
if (!noValidation && sampleFile.endsWith(".bam")) { // todo -- implement validation for .list BAM files
if (!noValidation) {
for (sampleFile <- bamList)
add(validate(sampleFile, preValidateLog),
validate(recalBam, postValidateLog))
}
if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(sampleFile, targetIntervals))
add(target(bamList, targetIntervals))
add(clean(sampleFile, targetIntervals, cleanedBam),
add(clean(bamList, targetIntervals, cleanedBam),
dedup(cleanedBam, dedupedBam, metricsFile),
cov(dedupedBam, preRecalFile),
recal(dedupedBam, preRecalFile, recalBam),
@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript {
this.maxRecordsInRam = 100000
}
case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
this.input_file :+= inBams
this.input_file = inBams
this.out = outIntervals
this.mismatchFraction = 0.0
this.known :+= qscript.dbSNP
@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outIntervals + ".target"
}
case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= inBams
case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
this.input_file = inBams
this.targetIntervals = tIntervals
this.out = outBam
this.known :+= qscript.dbSNP

View File

@ -4,9 +4,9 @@ import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.util.QScriptUtils
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
import org.broadinstitute.sting.utils.exceptions.UserException
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups}
/**
* Created by IntelliJ IDEA.
@ -16,7 +16,7 @@ import org.broadinstitute.sting.commandline.Hidden
*/
class RecalibrateBaseQualities extends QScript {
class PacbioProcessingPipeline extends QScript {
@Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
var input: File = _
@ -39,13 +39,16 @@ class RecalibrateBaseQualities extends QScript {
@Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _
@Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
var BLASR_BAM: Boolean = false
@Hidden
@Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
var dbq: Int = 20
@Hidden
@Input(shortName="bwastring", required=false)
var bwastring: String = ""
val queueLogDir: String = ".qlog/"
@ -57,8 +60,6 @@ class RecalibrateBaseQualities extends QScript {
var USE_BWA: Boolean = false
println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName)
if (file.endsWith(".fasta") || file.endsWith(".fq")) {
if (bwaPath == null) {
throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
@ -69,28 +70,34 @@ class RecalibrateBaseQualities extends QScript {
// FASTA -> BAM steps
val alignedSam: File = file.getName + ".aligned.sam"
val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam")
val rgBam: File = swapExt(file, ".bam", ".rg.bam")
val bamBase = if (USE_BWA) {rgBam} else {file}
// BAM Steps
val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam")
val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv")
val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam")
val path1: String = recalBam + ".before"
val path2: String = recalBam + ".after"
if (USE_BWA) {
add(align(file, alignedSam),
sortSam(alignedSam, sortedBam),
addQuals(sortedBam, qualBam, dbq),
addReadGroup(qualBam, rgBam, sample))
addReadGroup(sortedBam, rgBam, sample))
}
add(cov(bamBase, recalFile1),
recal(bamBase, recalFile1, recalBam),
else if (BLASR_BAM) {
val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam")
add(reorder(bamBase, reorderedBAM),
changeMQ(reorderedBAM, mqBAM))
}
val bam = if (BLASR_BAM) {mqBAM} else {bamBase}
add(cov(bam, recalFile1),
recal(bam, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
@ -110,13 +117,13 @@ class RecalibrateBaseQualities extends QScript {
case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam
def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam
this.memoryLimit = 8
this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
this.jobName = queueLogDir + outSam + ".bwa_sam_se"
}
case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs {
@Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs {
this.input = List(inSam)
this.output = outBam
this.sortOrder = SortOrder.coordinate
@ -124,10 +131,16 @@ class RecalibrateBaseQualities extends QScript {
this.jobName = queueLogDir + outBam + ".sortSam"
}
case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs {
case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs {
this.input = List(inSam)
this.output = outSam
this.sortReference = reference
}
case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
this.input_file :+= inBam
this.out = outBam
this.DBQ = qual
this.read_filter :+= "ReassignMappingQuality"
}
case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
@ -145,7 +158,8 @@ class RecalibrateBaseQualities extends QScript {
}
case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.DBQ = dbq
this.knownSites :+= dbSNP
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= inBam
this.recal_file = outRecalFile
@ -155,6 +169,7 @@ class RecalibrateBaseQualities extends QScript {
}
case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
this.DBQ = dbq
this.input_file :+= inBam
this.recal_file = inRecalFile
this.out = outBam

View File

@ -95,7 +95,8 @@ class QCommandLine extends CommandLineProgram with Logging {
def execute = {
qGraph.settings = settings
for (script <- pluginManager.createAllTypes()) {
val allQScripts = pluginManager.createAllTypes();
for (script <- allQScripts) {
logger.info("Scripting " + pluginManager.getName(script.getClass.asSubclass(classOf[QScript])))
loadArgumentsIntoObject(script)
try {
@ -108,14 +109,26 @@ class QCommandLine extends CommandLineProgram with Logging {
logger.info("Added " + script.functions.size + " functions")
}
// Execute the job graph
qGraph.run()
// walk over each script, calling onExecutionDone
for (script <- allQScripts) {
script.onExecutionDone(qGraph.getFunctionsAndStatus(script.functions), qGraph.success)
if ( ! settings.disableJobReport ) {
val jobStringName = (QScriptUtils.?(settings.jobReportFile)).getOrElse(settings.qSettings.jobNamePrefix + ".jobreport.txt")
val jobReportFile = new File(jobStringName)
logger.info("Writing JobLogging GATKReport to file " + jobReportFile)
QJobReport.printReport(qGraph.getFunctionsAndStatus(script.functions), jobReportFile)
QJobReport.plotReport(settings.rScriptArgs, jobReportFile)
}
}
if (!qGraph.success) {
logger.info("Done with errors")
qGraph.logFailed()
1
} else {
logger.info("Done")
0
}
}

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.queue
import engine.JobRunInfo
import org.broadinstitute.sting.queue.function.QFunction
import annotation.target.field
import io.Source
@ -57,6 +58,16 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon
*/
def script()
/**
* A default handler for the onExecutionDone() function. By default this doesn't do anything
* except print out a fine status message.
*/
def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) {
logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", jobs.size))
// this is too much output
// for ( (f, info) <- jobs ) logger.info(" %s %s".format(f.jobName, info))
}
/**
* The command line functions that will be executed for this QScript.
*/

View File

@ -23,6 +23,8 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod
*/
var depth = -1
val myRunInfo: JobRunInfo = JobRunInfo.default // purely for dryRun testing
/**
* Initializes with the current status of the function.
*/
@ -179,4 +181,8 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod
printWriter.close
IOUtils.writeContents(functionErrorFile, stackTrace.toString)
}
def getRunInfo = {
if ( runner == null ) myRunInfo else runner.getRunInfo
}
}

View File

@ -1,7 +1,9 @@
package org.broadinstitute.sting.queue.engine
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.queue.util.IOUtils
import java.util.Date
import org.broadinstitute.sting.queue.util.{Logging, IOUtils}
import org.broadinstitute.sting.utils.Utils
/**
* Runs a function that executes in process and does not fork out an external process.
@ -10,8 +12,13 @@ class InProcessRunner(val function: InProcessFunction) extends JobRunner[InProce
private var runStatus: RunnerStatus.Value = _
def start() = {
getRunInfo.startTime = new Date()
getRunInfo.exechosts = Utils.resolveHostname()
runStatus = RunnerStatus.RUNNING
function.run()
getRunInfo.doneTime = new Date()
val content = "%s%nDone.".format(function.description)
IOUtils.writeContents(function.jobOutputFile, content)
runStatus = RunnerStatus.DONE

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.engine
import java.util.Date
import java.text.SimpleDateFormat
/**
* Class containing tracked information about a job run.
*/
// todo -- it might be nice to have the hostname
/**
 * Class containing tracked information about a job run: start/done timestamps
 * (millisecond resolution) and the execution host(s), filled in by job runners.
 */
// todo -- it might be nice to have the hostname
class JobRunInfo {
  /** constant date format used by the pretty-printing helpers */
  // NOTE(review): SimpleDateFormat is not thread-safe; assumes each JobRunInfo
  // is only formatted from one thread — confirm against runner usage.
  val formatter = new SimpleDateFormat("yy-MM-dd H:mm:ss:SSS");

  /** The start time with millisecond resolution of this job; null until the job starts */
  var startTime: Date = _

  /** The done time with millisecond resolution of this job; null until the job finishes */
  var doneTime: Date = _

  /** Execution host(s) for this job; defaults to localhost until a runner overwrites it */
  var exechosts: String = "localhost"

  def getStartTime = startTime
  def getDoneTime = doneTime

  def getFormattedStartTime = formatTime(getStartTime)
  def getFormattedDoneTime = formatTime(getDoneTime)

  /** Helper function that pretty prints the date, or "null" for an unset one */
  private def formatTime(d: Date) = if ( d != null ) formatter.format(d) else "null"

  def getExecHosts = exechosts

  /**
   * Was any information set for this jobInfo? JobInfo can be unset because
   * the job never ran or because it already completed.
   */
  def isFilledIn = startTime != null

  /**
   * How long did the job run (in wall time)?  Returns -1 unless both the start
   * and done times have been recorded.
   */
  def getRuntimeInMs: Long = {
    // Bug fix: the original gated only on isFilledIn (startTime != null), so a
    // started-but-unfinished job hit an NPE on getDoneTime.getTime.  Require
    // both endpoints before computing the difference.
    if ( startTime != null && doneTime != null )
      getDoneTime.getTime - getStartTime.getTime
    else
      -1
  }

  override def toString: String =
    "started %s ended %s runtime %s".format(getFormattedStartTime, getFormattedDoneTime, getRuntimeInMs)
}
object JobRunInfo {
  /** Factory for a fresh, not-yet-filled-in run record. */
  def default: JobRunInfo = new JobRunInfo
}

View File

@ -69,6 +69,12 @@ trait JobRunner[TFunction <: QFunction] {
def cleanup() {
}
/**
* Must be overloaded
*/
val runInfo = JobRunInfo.default
def getRunInfo = runInfo
/**
* Calls back to a hook that an expert user can setup to modify a job.
* @param value Value to modify.

View File

@ -38,6 +38,8 @@ import org.apache.commons.lang.StringUtils
import org.broadinstitute.sting.queue.util._
import collection.immutable.{TreeSet, TreeMap}
import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction}
import java.util.Date
import org.broadinstitute.sting.utils.Utils
/**
* The internal dependency tracker between sets of function input and output files.
@ -319,7 +321,10 @@ class QGraph extends Logging {
logger.debug("+++++++")
foreachFunction(readyJobs.toList, edge => {
if (running) {
edge.myRunInfo.startTime = new Date()
edge.getRunInfo.exechosts = Utils.resolveHostname()
logEdge(edge)
edge.myRunInfo.doneTime = new Date()
edge.markAsDone
}
})
@ -939,6 +944,14 @@ class QGraph extends Logging {
edges.sorted(functionOrdering).foreach(edge => if (running) f(edge))
}
/**
* Utility function for running a method over all function edges.
* @param edgeFunction Function to run for each FunctionEdge.
*/
private def getFunctionEdges: List[FunctionEdge] = {
jobGraph.edgeSet.toList.filter(_.isInstanceOf[FunctionEdge]).asInstanceOf[List[FunctionEdge]]
}
/**
* Utility function for running a method over all functions, but traversing the nodes in order of dependency.
* @param edgeFunction Function to run for each FunctionEdge.
@ -1028,6 +1041,10 @@ class QGraph extends Logging {
*/
def isShutdown = !running
def getFunctionsAndStatus(functions: List[QFunction]): Map[QFunction, JobRunInfo] = {
getFunctionEdges.map(edge => (edge.function, edge.getRunInfo)).toMap
}
/**
* Kills any forked jobs still running.
*/

View File

@ -26,8 +26,9 @@ package org.broadinstitute.sting.queue.engine
import java.io.File
import org.broadinstitute.sting.queue.QSettings
import org.broadinstitute.sting.commandline.{ArgumentCollection, Argument}
import org.broadinstitute.sting.queue.util.SystemUtils
import org.broadinstitute.sting.commandline.{Advanced, ArgumentCollection, Argument}
import org.broadinstitute.sting.utils.R.RScriptExecutor
/**
* Command line options for a QGraph.
@ -69,6 +70,16 @@ class QGraphSettings {
@Argument(fullName="expanded_dot_graph", shortName="expandedDot", doc="Outputs the queue graph of scatter gather to a .dot file. Otherwise overwrites the dot_graph", required=false)
var expandedDotFile: File = _
@Argument(fullName="jobReport", shortName="jobReport", doc="File where we will write the Queue job report", required=false)
var jobReportFile: String = _
@Advanced
@Argument(fullName="disableJobReport", shortName="disabpleJobReport", doc="If provided, we will not create a job report", required=false)
var disableJobReport: Boolean = false
@ArgumentCollection
var rScriptArgs = new RScriptExecutor.RScriptArgumentCollection
@ArgumentCollection
val qSettings = new QSettings
}

View File

@ -31,11 +31,12 @@ import org.broadinstitute.sting.jna.lsf.v7_0_6.{LibLsf, LibBat}
import org.broadinstitute.sting.utils.Utils
import org.broadinstitute.sting.jna.clibrary.LibC
import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.{submitReply, submit}
import com.sun.jna.ptr.IntByReference
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
import com.sun.jna.{Structure, StringArray, NativeLong}
import java.util.regex.Pattern
import java.lang.StringBuffer
import java.util.Date
import com.sun.jna.{Pointer, Structure, StringArray, NativeLong}
import com.sun.jna.ptr.{PointerByReference, IntByReference}
/**
* Runs jobs on an LSF compute cluster.
@ -271,12 +272,27 @@ object Lsf706JobRunner extends Logging {
logger.debug("Job Id %s status / exitStatus / exitInfo: 0x%02x / 0x%02x / 0x%02x".format(runner.jobId, jobStatus, exitStatus, exitInfo))
def updateRunInfo() {
// the platform LSF startTimes are in seconds, not milliseconds, so convert to the java convention
runner.getRunInfo.startTime = new Date(jobInfo.startTime.longValue * 1000)
runner.getRunInfo.doneTime = new Date(jobInfo.endTime.longValue * 1000)
val exHostsRaw = jobInfo.exHosts.getStringArray(0)
//logger.warn("exHostsRaw = " + exHostsRaw)
val exHostsList = exHostsRaw.toList
//logger.warn("exHostsList = " + exHostsList)
val exHosts = exHostsList.reduceLeft(_ + "," + _)
//logger.warn("exHosts = " + exHosts)
runner.getRunInfo.exechosts = exHosts
}
runner.updateStatus(
if (Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_DONE)) {
// Done successfully.
updateRunInfo()
RunnerStatus.DONE
} else if (Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_EXIT) && !willRetry(exitInfo, endTime)) {
// Exited function that (probably) won't be retried.
updateRunInfo()
RunnerStatus.FAILED
} else {
// Note that we still saw the job in the system.

View File

@ -27,6 +27,9 @@ package org.broadinstitute.sting.queue.engine.shell
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.util.ShellJob
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
import java.util.Date
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
import org.broadinstitute.sting.utils.Utils
/**
* Runs jobs one at a time locally
@ -50,8 +53,11 @@ class ShellJobRunner(val function: CommandLineFunction) extends CommandLineJobRu
// Allow advanced users to update the job.
updateJobRun(job)
getRunInfo.startTime = new Date()
getRunInfo.exechosts = Utils.resolveHostname()
updateStatus(RunnerStatus.RUNNING)
job.run()
getRunInfo.doneTime = new Date()
updateStatus(RunnerStatus.DONE)
}

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.queue.extensions.picard
import org.broadinstitute.sting.commandline._
import java.io.File
/*
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 6/22/11
* Time: 10:35 AM
*/
/**
 * Queue extension wrapping Picard's ReorderSam tool, which reorders reads in a
 * BAM/SAM file to match the contig ordering of a given reference sequence.
 */
class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
  analysisName = "ReorderSam"
  javaMainClass = "net.sf.picard.sam.ReorderSam"

  @Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true)
  var input: List[File] = Nil

  @Output(doc="Output file (bam or sam) to write extracted reads to.", shortName = "output", fullName = "output_bam_file", required = true)
  var output: File = _

  @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false)
  var outputIndex: File = _

  @Argument(doc="Reference sequence to reorder reads to match.", shortName = "ref", fullName = "sort_reference", required = true)
  var sortReference: File = _

  @Argument(doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig.", shortName = "aic", fullName = "allow_incomplete_concordance", required = false)
  var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = _

  @Argument(doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. Highly dangerous, only use if you know what you are doing.", shortName = "acld", fullName = "allow_contig_length_discordance", required = false)
  var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = _

  /** Defaults the output index file next to the output BAM when none was given. */
  override def freezeFieldValues() {
    super.freezeFieldValues()
    if (outputIndex == null && output != null)
      outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
  }

  override def inputBams = input
  override def outputBam = output

  // Constructor-time configuration: always index the output; ReorderSam
  // dictates the ordering itself, so no explicit sort order is passed.
  this.createIndex = Some(true)
  this.sortOrder = null

  // BUGFIX: the two optional(...) calls were previously separate statements
  // (missing '+'), so ALLOW_CONTIG_LENGTH_DISCORDANCE was evaluated at
  // construction time and silently dropped from the command line. Chain all
  // pieces into a single expression so both optional flags are emitted.
  override def commandLine = super.commandLine +
                             " REFERENCE=" + sortReference +
                             optional(" ALLOW_INCOMPLETE_DICT_CONCORDANCE=", ALLOW_INCOMPLETE_DICT_CONCORDANCE) +
                             optional(" ALLOW_CONTIG_LENGTH_DISCORDANCE=", ALLOW_CONTIG_LENGTH_DISCORDANCE)
}

View File

@ -30,14 +30,14 @@ import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.{QException, QSettings}
import collection.JavaConversions._
import org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction
import org.broadinstitute.sting.queue.util.{Logging, CollectionUtils, IOUtils, ReflectionUtils}
import org.broadinstitute.sting.queue.util._
/**
* The base interface for all functions in Queue.
* Inputs and outputs are specified as Sets of values.
* Inputs are matched to other outputs by using .equals()
*/
trait QFunction extends Logging {
trait QFunction extends Logging with QJobReport {
/** A short description of this step in the graph */
var analysisName: String = "<function>"
@ -83,11 +83,17 @@ trait QFunction extends Logging {
*/
var deleteIntermediateOutputs = true
// -------------------------------------------------------
//
// job run information
//
// -------------------------------------------------------
/**
* Copies settings from this function to another function.
* @param function QFunction to copy values to.
*/
def copySettingsTo(function: QFunction) {
override def copySettingsTo(function: QFunction) {
function.analysisName = this.analysisName
function.jobName = this.jobName
function.qSettings = this.qSettings
@ -99,6 +105,8 @@ trait QFunction extends Logging {
function.updateJobRun = this.updateJobRun
function.isIntermediate = this.isIntermediate
function.deleteIntermediateOutputs = this.deleteIntermediateOutputs
function.reportGroup = this.reportGroup
function.reportFeatures = this.reportFeatures
}
/** File to redirect any output. Defaults to <jobName>.out */

View File

@ -0,0 +1,164 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.util
import org.broadinstitute.sting.queue.function.QFunction
import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport}
import org.broadinstitute.sting.utils.exceptions.UserException
import org.broadinstitute.sting.queue.engine.JobRunInfo
import java.io.{FileOutputStream, PrintStream, File}
import org.broadinstitute.sting.queue.function.scattergather.{GathererFunction, ScatterFunction}
import org.broadinstitute.sting.utils.R.RScriptExecutor.RScriptArgumentCollection
import org.broadinstitute.sting.utils.R.RScriptExecutor
import org.broadinstitute.sting.queue.QScript
/**
* A mixin to add Job info to the class
*/
/**
 * A mixin that attaches job-report metadata (a group name plus a map of
 * string features) to a QFunction, for later aggregation into a GATKReport.
 * Must be mixed into a QFunction (see the self-type below).
 */
trait QJobReport extends Logging {
  self: QFunction =>

  // The report table this job belongs to; defaults to a sanitized analysisName (see getReportGroup).
  protected var reportGroup: String = null
  // Feature name -> value pairs emitted as columns of the report table.
  protected var reportFeatures: Map[String, String] = Map()
  // When false, this job is excluded from the printed report.
  protected var reportEnabled: Boolean = true

  def includeInReport = reportEnabled
  def enableReport() { reportEnabled = true }
  def disableReport() { reportEnabled = false }

  /**
   * Populates reportFeatures from the completed job's JobRunInfo
   * (host, start/done times, runtime), stringifying every value.
   * User-provided reportFeatures are appended last so they override
   * the defaults (including "iteration").
   */
  def setRunInfo(info: JobRunInfo) {
    //logger.info("info " + info)
    reportFeatures = Map(
      "iteration" -> 1,
      "analysisName" -> getReportGroup,
      "jobName" -> QJobReport.workAroundSameJobNames(this),
      "intermediate" -> self.isIntermediate,
      "exechosts" -> info.getExecHosts,
      "startTime" -> info.getStartTime.getTime,
      "doneTime" -> info.getDoneTime.getTime,
      "formattedStartTime" -> info.getFormattedStartTime,
      "formattedDoneTime" -> info.getFormattedDoneTime,
      "runtime" -> info.getRuntimeInMs).mapValues((x:Any) => if (x != null) x.toString else "null") ++ reportFeatures
    // note -- by adding reportFeatures second we override iteration
    // (or any other binding) with the user provided value
  }

  /** The report Group is the analysis name transform to only contain valid GATKReportTable characters */
  def getReportGroup = self.analysisName.replaceAll(GATKReportTable.INVALID_TABLE_NAME_REGEX, "_")

  def getReportFeatures = reportFeatures
  def getReportFeatureNames: List[String] = getReportFeatures.keys.toList

  /** Returns the value for key, or throws if the feature was never set. */
  def getReportFeature(key: String): String = {
    getReportFeatures.get(key) match {
      case Some(x) => x
      case None => throw new RuntimeException("Get called with key %s but no value was found".format(key))
    }
  }

  def getReportName: String = getReportFeature("jobName")

  /** Replaces the feature map wholesale with the given (stringified) bindings. */
  def configureJobReport(features: Map[String, Any]) {
    this.reportFeatures = features.mapValues(_.toString)
  }

  // copy the QJobReport information -- todo : what's the best way to do this?
  // NOTE(review): `self.copySettingsTo(function)` dispatches virtually; if the
  // concrete QFunction overrides copySettingsTo (as QFunction itself appears
  // to), that override wins in linearization and this trait body may be dead
  // code — and if it WERE the active implementation, this call would recurse
  // on itself. Confirm against QFunction's override before relying on this.
  override def copySettingsTo(function: QFunction) {
    self.copySettingsTo(function)
    function.reportFeatures = this.reportFeatures
  }
}
/**
 * Companion utilities for collecting per-job run information into a
 * GATKReport and (optionally) plotting it via an R script.
 */
object QJobReport {
  val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R"

  // todo -- fixme to have a unique name for Scatter/gather jobs as well
  var seenCounter = 1
  var seenNames = Set[String]()

  /**
   * Writes a job report for all runnable entries to dest.
   * Jobs without filled-in run info, or flagged as excluded, are dropped.
   */
  def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) {
    val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport)
    jobs foreach {case (qf, info) => qf.setRunInfo(info)}
    val stream = new PrintStream(new FileOutputStream(dest))
    // Ensure the stream is closed even if report generation throws.
    try {
      printJobLogging(jobs.keys.toList, stream)
    } finally {
      stream.close()
    }
  }

  /** Renders the textual job report to a PDF next to it via the R plotting script. */
  def plotReport(args: RScriptArgumentCollection, jobReportFile: File) {
    val executor = new RScriptExecutor(args, false) // don't except on error
    val pdf = jobReportFile.getAbsolutePath + ".pdf"
    executor.callRScripts(JOB_REPORT_QUEUE_SCRIPT, jobReportFile.getAbsolutePath, pdf)
  }

  /** Disambiguates duplicate job names by suffixing a global counter. */
  def workAroundSameJobNames(func: QFunction):String = {
    if ( seenNames.apply(func.jobName) ) {
      seenCounter += 1
      "%s_%d".format(func.jobName, seenCounter)
    } else {
      seenNames += func.jobName
      func.jobName
    }
  }

  /**
   * Prints the JobLogging logs to a GATKReport. First splits up the
   * logs by group, and for each group generates a GATKReportTable
   */
  private def printJobLogging(logs: List[QFunction], stream: PrintStream) {
    // create the report
    val report: GATKReport = new GATKReport

    // create a table for each group of logs
    for ( (group, groupLogs) <- groupLogs(logs) ) {
      report.addTable(group, "Job logs for " + group)
      val table: GATKReportTable = report.getTable(group)
      table.addPrimaryKey("jobName", false)
      val keys = logKeys(groupLogs)

      // add the columns
      keys.foreach(table.addColumn(_, 0))
      for (log <- groupLogs) {
        for ( key <- keys )
          table.set(log.getReportName, key, log.getReportFeature(key))
      }
    }

    report.print(stream)
  }

  /** Buckets logs by their (sanitized) report group name. */
  private def groupLogs(logs: List[QFunction]): Map[String, List[QFunction]] = {
    logs.groupBy(_.getReportGroup)
  }

  /**
   * Returns the feature-name set shared by every log in the group,
   * throwing a UserException if any log disagrees.
   */
  private def logKeys(logs: List[QFunction]): Set[String] = {
    // the keys should be the same for each log, but we will check that
    val keys = Set[String](logs(0).getReportFeatureNames : _*)

    // BUGFIX: previously compared keys against a one-element
    // Set(List(...)) and threw when the sets *matched*, making the
    // consistency check a no-op. Compare the actual name sets and
    // throw only on a mismatch.
    for ( log <- logs )
      if ( keys != Set[String](log.getReportFeatureNames : _*) )
        throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " +
          "We found one with %s and another with %s").format(keys, log.getReportFeatureNames))

    keys
  }
}

View File

@ -57,4 +57,6 @@ object QScriptUtils {
}
def ?[A <: AnyRef](ref: A): Option[A] =
if (ref eq null) None else Some(ref)
}

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="edu.mit.broad" module="picard-private-parts" revision="1959" status="integration" publication="20110718185300" />
<info organisation="edu.mit.broad" module="picard-private-parts" revision="2034" status="integration" publication="20110718185300" />
</ivy-module>

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.49.895" status="release" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.52.944" status="release" />
</ivy-module>

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="sam" revision="1.49.895" status="release" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="sam" revision="1.52.944" status="release" />
</ivy-module>

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="org.broad" module="tribble" revision="18" status="integration" />
<info organisation="org.broad" module="tribble" revision="21" status="integration" />
</ivy-module>